From 5e66b3a3e8cec4232919a4c71a34b2b555d9b5b3 Mon Sep 17 00:00:00 2001
From: Carlos Timoteo <chmstimoteo@gmail.com>
Date: Wed, 11 Sep 2024 13:22:49 -0400
Subject: [PATCH] Bug fix purchase propensity training preparation (#193)

* predicting for only the users with traffic in the past 72h - purchase propensity

* running inference only for users events in the past 72h

* including 72h users for all models predictions

* considering null values in TabWorkflow models

* deleting unused pipfile

* upgrading lib versions

* implementing reporting preprocessing as a new pipeline

* adding more code documentation

* adding important information on the main README.md and DEVELOPMENT.md

* adding schedule run name and more code documentation

* implementing a new scheduler using the vertex ai sdk & adding user_id to procedures for consistency

* adding more code documentation

* adding code doc to the python custom component

* adding more code documentation

* fixing aggregated predictions query

* removing unnecessary resources from deployment

* Writing MDS guide

* adding the MDS developer and troubleshooting documentation

* fixing deployment for activation pipelines and gemini dataset

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* removing deprecated api

* fixing purchase propensity pipelines names

* adding extra condition for when there is not enough data for the window interval to be applied on backfill procedures

* adding more instructions for post deployment and fixing issues when GA4 export was configured for less than 10 days

* removing unnecessary comments

* adding the number of past days to process in the variables files

* adding comment about combining data from different ga4 export datasets to data store

* fixing small issues with feature engineering and ml pipelines

* fixing hyper parameter tuning for kmeans modeling

* fixing optuna parameters

* adding cloud shell image

* fixing the list of all possible users in the propensity training preparation tables

* additional guardrails for when there is not enough data

* adding more documentation

* adding more doc to feature store

* add feature store documentation

* adding ml pipelines docs

* adding ml pipelines docs

* adding more documentation

* adding user agent client info

* fixing scope of client info

* fix

* removing client_info from vertex components

* fixing versioning of tf submodules

* reconfiguring meta providers

* fixing issue 187

---------

Co-authored-by: Carlos Timoteo <ctimoteo@google.com>
---
 infrastructure/terraform/README.md            |  2 ++
 ...churn_propensity_training_preparation.sqlx |  2 +-
 ...chase_propensity_training_preparation.sqlx | 21 +++++++++----------
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/infrastructure/terraform/README.md b/infrastructure/terraform/README.md
index 4a2ae4cd..793f9419 100644
--- a/infrastructure/terraform/README.md
+++ b/infrastructure/terraform/README.md
@@ -91,6 +91,7 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com
    gcloud auth login
    gcloud auth application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth"
    gcloud auth application-default set-quota-project $PROJECT_ID
+   export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json"
    ```
 
     **Note:** You may receive an error message informing the Cloud Resource Manager API has not been used/enabled for your project, similar to the following: 
@@ -187,6 +188,7 @@ Follow the authentication workflow, since your credentials expires daily:
    # Authenticate your application default login to Google Cloud with the right scopes for Terraform to run
    gcloud auth application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth"
    gcloud auth application-default set-quota-project $PROJECT_ID
+   export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json"
    ```
 
 To resume working on a new terminal session run the following commands:
diff --git a/sql/query/invoke_churn_propensity_training_preparation.sqlx b/sql/query/invoke_churn_propensity_training_preparation.sqlx
index b6e5ebbe..632fb03b 100644
--- a/sql/query/invoke_churn_propensity_training_preparation.sqlx
+++ b/sql/query/invoke_churn_propensity_training_preparation.sqlx
@@ -37,7 +37,7 @@ DECLARE min_date DATE;
 SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
 SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
 
--- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date
+-- If min_date >= maximum event_date OR max_date <= minimum event_date OR min_date >= max_date, then reset min_date to the min event_date and max_date to the max event_date
 IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`)  OR min_date >= max_date THEN
   SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
   SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
diff --git a/sql/query/invoke_purchase_propensity_training_preparation.sqlx b/sql/query/invoke_purchase_propensity_training_preparation.sqlx
index 615f8cac..4d2eab86 100644
--- a/sql/query/invoke_purchase_propensity_training_preparation.sqlx
+++ b/sql/query/invoke_purchase_propensity_training_preparation.sqlx
@@ -32,20 +32,19 @@ DECLARE purchasers INT64 DEFAULT NULL;
 -- Used to store the maximum and minimum event dates from the source data.
 DECLARE max_date DATE;
 DECLARE min_date DATE;
-SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
-SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
 
 -- Determining Maximum and Minimum Dates
-SET train_end_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
-SET train_start_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
+SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
+SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); 
 
--- If train_start_date > maximum event_date OR train_end_date < minimum event_date, then set train_start_date for the min event_date and set train_end_date for the max event_date
-IF train_start_date >= max_date OR train_end_date <= min_date THEN
-  SET train_start_date = min_date;
-  SET train_end_date = max_date;
+-- If min_date >= maximum event_date OR max_date <= minimum event_date OR min_date >= max_date, then reset min_date to the min event_date and max_date to the max event_date
+IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`)  OR min_date >= max_date THEN
+  SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
+  SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
 END IF;
 
--- Sets the train_split_end_number to a user-defined value. This value determines the proportion of data used for training.
+-- Setting Split Numbers
+-- Sets the train_split_end_number to a user-defined value. This value determines the proportion of data used for training.
 SET train_split_end_number = {{train_split_end_number}};    -- If you want 60% for training use number 5. If you want 80% use number 7.
 -- Sets the validation_split_end_number to a user-defined value, controlling the proportion of data used for validation.
 SET validation_split_end_number = {{validation_split_end_number}};
@@ -61,8 +60,8 @@ SET purchasers = (SELECT COUNT(DISTINCT user_pseudo_id)
 -- If there are purchasers no changes to the train_start_date and train_end_date
 -- Else, expand the interval, hopefully a purchaser will be in the interval
 IF purchasers > 0 THEN
-    SET train_start_date =  train_start_date;
-    SET train_end_date = train_end_date;
+    SET train_start_date =  GREATEST(train_start_date, min_date);
+    SET train_end_date = LEAST(train_end_date, max_date);
 ELSE
     SET train_start_date =  min_date;
     SET train_end_date = max_date;