From 5e66b3a3e8cec4232919a4c71a34b2b555d9b5b3 Mon Sep 17 00:00:00 2001 From: Carlos Timoteo Date: Wed, 11 Sep 2024 13:22:49 -0400 Subject: [PATCH] Bug fix purchase propensity training preparation (#193) * predicting for only the users with traffic in the past 72h - purchase propensity * running inference only for users events in the past 72h * including 72h users for all models predictions * considering null values in TabWorkflow models * deleting unused pipfile * upgrading lib versions * implementing reporting preprocessing as a new pipeline * adding more code documentation * adding important information on the main README.md and DEVELOPMENT.md * adding schedule run name and more code documentation * implementing a new scheduler using the vertex ai sdk & adding user_id to procedures for consistency * adding more code documentation * adding code doc to the python custom component * adding more code documentation * fixing aggregated predictions query * removing unnecessary resources from deployment * Writing MDS guide * adding the MDS developer and troubleshooting documentation * fixing deployment for activation pipelines and gemini dataset * Update README.md * Update README.md * Update README.md * Update README.md * removing deprecated api * fixing purchase propensity pipelines names * adding extra condition for when there is not enough data for the window interval to be applied on backfill procedures * adding more instructions for post deployment and fixing issues when GA4 export was configured for less than 10 days * removing unnecessary comments * adding the number of past days to process in the variables files * adding comment about combining data from different ga4 export datasets to data store * fixing small issues with feature engineering and ml pipelines * fixing hyper parameter tuning for kmeans modeling * fixing optuna parameters * adding cloud shell image * fixing the list of all possible users in the propensity training preparation tables * additional 
guardrails for when there is not enough data * adding more documentation * adding more doc to feature store * add feature store documentation * adding ml pipelines docs * adding ml pipelines docs * adding more documentation * adding user agent client info * fixing scope of client info * fix * removing client_info from vertex components * fixing versioning of tf submodules * reconfiguring meta providers * fixing issue 187 --------- Co-authored-by: Carlos Timoteo --- infrastructure/terraform/README.md | 2 ++ ...churn_propensity_training_preparation.sqlx | 2 +- ...chase_propensity_training_preparation.sqlx | 21 +++++++++---------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/infrastructure/terraform/README.md b/infrastructure/terraform/README.md index 4a2ae4cd..793f9419 100644 --- a/infrastructure/terraform/README.md +++ b/infrastructure/terraform/README.md @@ -91,6 +91,7 @@ Step by step installation guide with [![Open in Cloud Shell](https://gstatic.com gcloud auth login gcloud auth application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth" gcloud auth application-default set-quota-project $PROJECT_ID + export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json" ``` **Note:** You may receive an error message informing the Cloud Resource Manager API has not been used/enabled for your project, similar to the following: @@ -187,6 +188,7 @@ Follow the authentication workflow, since your credentials expires daily: # Authenticate your application default login to Google Cloud with the right scopes for Terraform to run gcloud auth 
application-default login --quiet --scopes="openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/analytics,https://www.googleapis.com/auth/analytics.edit,https://www.googleapis.com/auth/analytics.provision,https://www.googleapis.com/auth/analytics.readonly,https://www.googleapis.com/auth/accounts.reauth" gcloud auth application-default set-quota-project $PROJECT_ID + export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.config/gcloud/application_default_credentials.json" ``` To resume working on a new terminal session run the following commands: diff --git a/sql/query/invoke_churn_propensity_training_preparation.sqlx b/sql/query/invoke_churn_propensity_training_preparation.sqlx index b6e5ebbe..632fb03b 100644 --- a/sql/query/invoke_churn_propensity_training_preparation.sqlx +++ b/sql/query/invoke_churn_propensity_training_preparation.sqlx @@ -37,7 +37,7 @@ DECLARE min_date DATE; SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); --- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date +-- If min_date >= maximum event_date OR max_date <= minimum event_date OR min_date >= max_date, then reset min_date to the min event_date and max_date to the max event_date IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); diff --git 
a/sql/query/invoke_purchase_propensity_training_preparation.sqlx b/sql/query/invoke_purchase_propensity_training_preparation.sqlx index 615f8cac..4d2eab86 100644 --- a/sql/query/invoke_purchase_propensity_training_preparation.sqlx +++ b/sql/query/invoke_purchase_propensity_training_preparation.sqlx @@ -32,20 +32,19 @@ DECLARE purchasers INT64 DEFAULT NULL; -- Used to store the maximum and minimum event dates from the source data. DECLARE max_date DATE; DECLARE min_date DATE; -SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); -SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); -- Determining Maximum and Minimum Dates -SET train_end_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); -SET train_start_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); --- If train_start_date > maximum event_date OR train_end_date < minimum event_date, then set train_start_date for the min event_date and set train_end_date for the max event_date -IF train_start_date >= max_date OR train_end_date <= min_date THEN - SET train_start_date = min_date; - SET train_end_date = max_date; +-- If min_date >= maximum event_date OR max_date <= minimum event_date OR min_date >= max_date, then reset min_date to the min event_date and max_date to the max event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM 
`{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); END IF; --- Sets the train_split_end_number to a user-defined value. This value determines the proportion of data used for training. +-- Setting Split Numbers +-- Sets the train_split_end_number to a user-defined value. This value likely determines the proportion of data used for training. SET train_split_end_number = {{train_split_end_number}}; -- If you want 60% for training use number 5. If you want 80% use number 7. -- Sets the validation_split_end_number to a user-defined value, controlling the proportion of data used for validation. SET validation_split_end_number = {{validation_split_end_number}}; @@ -61,8 +60,8 @@ SET purchasers = (SELECT COUNT(DISTINCT user_pseudo_id) -- If there are purchasers no changes to the train_start_date and train_end_date -- Else, expand the interval, hopefully a purchaser will be in the interval IF purchasers > 0 THEN - SET train_start_date = train_start_date; - SET train_end_date = train_end_date; + SET train_start_date = GREATEST(train_start_date, min_date); + SET train_end_date = LEAST(train_end_date, max_date); ELSE SET train_start_date = min_date; SET train_end_date = max_date;