Skip to content

Commit

Permalink
Improve backfill procedures (#35)
Browse files Browse the repository at this point in the history
* Update invoke_backfill_customer_lifetime_value_label.sqlx

* rewriting the backfill queries avoiding the for loop

* templating backfill cltv label

* templating the other queries

* fixing typo

* fixing typo on backfill user dimensions

* adjust post-installation instructions

* substitute create temp table for CREATE OR REPLACE temp table

* bug fixing user_session_event_aggregated_metrics

* bug fixing dates_interval on backfill user_segmentation_dimensions

* bug fixing dimensions and labels features calculations

* bug fixing dimensions and labels features calculations

* fixing typos

* removing nth_* features and fixing typos

* removing nth_* features

* removing fixing first_traffic features

* removing unnecessary features and adjusting calculations to avoid duplicates

* fixing purchases labels and user_ltv_revenue values dependencies so that labels don't count today's purchases

* fixing purchases labels and removed features from table schemas

* removing duplicate rows

* adding more sp invokes to readme

* fixing typo

* fixing typo

* fixing missing feature on dimensions procedures

* fixing removing unnecessary features from audience segmentation inference prep

---------

Co-authored-by: Carlos Timoteo <[email protected]>
  • Loading branch information
chmstimoteo and Carlos Timoteo authored Sep 21, 2023
1 parent 5aa6df0 commit e2d16dc
Show file tree
Hide file tree
Showing 32 changed files with 4,165 additions and 3,097 deletions.
64 changes: 32 additions & 32 deletions config/config.yaml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -563,80 +563,80 @@ bigquery:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_lifetime_dimensions:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_lifetime_dimensions"
insert_table: "user_lifetime_dimensions"
interval_max_date: 180
interval_min_date: 180
interval_end_date: 180
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_scoped_lifetime_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_scoped_lifetime_metrics"
insert_table: "user_scoped_lifetime_metrics"
interval_max_date: 180
interval_min_date: 180
interval_end_date: 180
invoke_backfill_user_session_event_aggregated_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_session_event_aggregated_metrics:
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_session_event_aggregated_metrics"
insert_table: "user_session_event_aggregated_metrics"
interval_max_date: 15
interval_min_date: 30
interval_end_date: 30
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_customer_lifetime_value_label:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "customer_lifetime_value_label"
insert_table: "customer_lifetime_value_label"
interval_max_date: 180
interval_min_date: 180
interval_end_date: 180
invoke_backfill_user_lookback_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_lookback_metrics:
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_lookback_metrics"
insert_table: "user_lookback_metrics"
interval_min_date: 15
interval_end_date: 15
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_rolling_window_lifetime_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_rolling_window_lifetime_metrics"
insert_table: "user_rolling_window_lifetime_metrics"
interval_max_date: 180
interval_min_date: 180
interval_end_date: 180
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_scoped_segmentation_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_scoped_segmentation_metrics"
insert_table: "user_scoped_segmentation_metrics"
interval_min_date: 15
interval_end_date: 15
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
interval_end_date: 15
invoke_backfill_user_segmentation_dimensions:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_segmentation_dimensions"
insert_table: "user_segmentation_dimensions"
interval_min_date: 15
interval_end_date: 15
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_purchase_propensity_label:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "purchase_propensity_label"
insert_table: "purchase_propensity_label"
interval_max_date: 15
interval_min_date: 30
interval_end_date: 30
Expand All @@ -645,28 +645,28 @@ bigquery:
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_dimensions"
insert_table: "user_dimensions"
interval_max_date: 15
interval_min_date: 30
interval_end_date: 30
invoke_backfill_user_rolling_window_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_rolling_window_metrics"
insert_table: "user_rolling_window_metrics"
interval_max_date: 15
interval_min_date: 30
interval_end_date: 30
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
invoke_backfill_user_scoped_metrics:
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_scoped_metrics"
insert_table: "user_scoped_metrics"
interval_max_date: 15
interval_min_date: 30
interval_end_date: 30
mds_project_id: "${project_id}"
mds_dataset: "${mds_dataset}"
interval_end_date: 30
invoke_customer_lifetime_value_label:
project_id: "${project_id}"
dataset: "feature_store"
Expand Down
27 changes: 19 additions & 8 deletions infrastructure/terraform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,27 +115,38 @@ To manually start the data flow you must perform the following tasks:

On the Google Cloud console, navigate to the Workflows page. You will see a Workflow named `dataform-prod-incremental`. Under Actions, click on the three dots and `Execute` the Workflow.

**Note:** If you have a considerable amount of data (>XX GBs of data) in your exported GA4 and Ads BigQuery datasets, it can take several minutes or hours to process all the data. Make sure that the processing has completed successfully before you continue to the next step.
**Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 and Ads BigQuery datasets, it can take several minutes or hours to process all the data. Make sure that the processing has completed successfully before you continue to the next step.

1. Invoke the BigQuery stored procedures with the prefix `invoke_backfill_*` to backfill the feature store if the GA4 Export was enabled long before MDE was installed.

On the Google Cloud console, navigate to the BigQuery page. In the query composer, run the following queries to invoke the stored procedures.
```sql
## Backfill customer ltv tables
CALL `feature_store.invoke_backfill_customer_lifetime_value_label`();
CALL `feature_store.invoke_backfill_purchase_propensity_label`();
CALL `feature_store.invoke_backfill_user_dimensions`();
CALL `feature_store.invoke_backfill_user_lifetime_dimensions`();
CALL `feature_store.invoke_backfill_user_lookback_metrics`();
CALL `feature_store.invoke_backfill_user_rolling_window_lifetime_metrics`();
CALL `feature_store.invoke_backfill_user_rolling_window_metrics`();
CALL `feature_store.invoke_backfill_user_scoped_lifetime_metrics`();
CALL `customer_lifetime_value.invoke_customer_lifetime_value_training_preparation`();
CALL `customer_lifetime_value.invoke_customer_lifetime_value_inference_preparation`();
## Backfill purchase propensity tables
CALL `feature_store.invoke_backfill_user_dimensions`();
CALL `feature_store.invoke_backfill_user_rolling_window_metrics`();
CALL `feature_store.invoke_backfill_user_scoped_metrics`();
CALL `feature_store.invoke_backfill_user_scoped_segmentation_metrics`();
CALL `feature_store.invoke_backfill_user_segmentation_dimensions`();
CALL `feature_store.invoke_backfill_user_session_event_aggregated_metrics`();
CALL `feature_store.invoke_backfill_purchase_propensity_label`();
CALL `purchase_propensity.invoke_purchase_propensity_training_preparation`();
CALL `purchase_propensity.invoke_purchase_propensity_inference_preparation`();
## Backfill audience segmentation tables
CALL `feature_store.invoke_backfill_user_segmentation_dimensions`();
CALL `feature_store.invoke_backfill_user_lookback_metrics`();
CALL `feature_store.invoke_backfill_user_scoped_segmentation_metrics`();
CALL `audience_segmentation.invoke_audience_segmentation_training_preparation`();
CALL `audience_segmentation.invoke_audience_segmentation_inference_preparation`();
```

**Note:** If you have a considerable amount of data (>XX GBs of data) in your exported GA4 BigQuery datasets over the last six months, it can take several hours to backfill the feature data so that you can train your ML model. Make sure that backfilling doesn't fail in the first several minutes before you continue to the next step.
**Note:** If you have a considerable amount of data (>XXX GBs of data) in your exported GA4 BigQuery datasets over the last six months, it can take several hours to backfill the feature data so that you can train your ML model. Make sure that the backfill procedures start without errors before you continue to the next step.

1. Redeploy the ML pipelines using Terraform.

Expand Down
33 changes: 9 additions & 24 deletions sql/procedure/audience_segmentation_inference_preparation.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,14 @@
SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY);

CREATE TEMP TABLE inference_preparation AS (
SELECT
SELECT DISTINCT
UD.user_pseudo_id,
UD.user_id,
UD.feature_date,
UD.month_of_the_year,
UD.week_of_the_year,
UD.day_of_the_month,
UD.day_of_week,
UD.hour_of_day,
UD.nth_day,
UD.nth_hour,
UD.nth_week,
UD.nth_month,
UD.device_category,
UD.device_mobile_brand_name,
UD.device_mobile_model_name,
Expand Down Expand Up @@ -102,6 +97,8 @@ CREATE TEMP TABLE inference_preparation AS (
WHERE
-- Keep only the rows whose feature_date equals the inference date
UD.feature_date = inference_date );


INSERT INTO
`{{project_id}}.{{dataset}}.{{insert_table}}` (feature_date,
user_pseudo_id,
Expand All @@ -110,11 +107,6 @@ INSERT INTO
week_of_the_year,
day_of_the_month,
day_of_week,
hour_of_day,
nth_day,
nth_hour,
nth_week,
nth_month,
device_category,
device_mobile_brand_name,
device_mobile_model_name,
Expand Down Expand Up @@ -182,11 +174,11 @@ SELECT
week_of_the_year,
day_of_the_month,
day_of_week,
hour_of_day,
nth_day,
nth_hour,
nth_week,
nth_month,
NULL as hour_of_day,
NULL as nth_day,
NULL as nth_hour,
NULL as nth_week,
NULL as nth_month,
device_category,
device_mobile_brand_name,
device_mobile_model_name,
Expand Down Expand Up @@ -251,7 +243,7 @@ FROM

CREATE OR REPLACE TABLE
`{{project_id}}.{{dataset}}.audience_segmentation_inference_15` AS(
SELECT
SELECT DISTINCT
CURRENT_TIMESTAMP() AS processed_timestamp,
feature_date,
user_pseudo_id,
Expand All @@ -260,11 +252,6 @@ CREATE OR REPLACE TABLE
week_of_the_year,
day_of_the_month,
day_of_week,
hour_of_day,
nth_day,
nth_hour,
nth_week,
nth_month,
device_category,
device_mobile_brand_name,
device_mobile_model_name,
Expand Down Expand Up @@ -334,7 +321,6 @@ CREATE OR REPLACE VIEW
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down Expand Up @@ -372,7 +358,6 @@ SELECT
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down
12 changes: 1 addition & 11 deletions sql/procedure/audience_segmentation_training_preparation.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,14 @@ SELECT max_date;
SELECT min_date;

CREATE TEMP TABLE training_preparation as (
SELECT
SELECT DISTINCT
UD.user_pseudo_id,
UD.user_id,
UD.feature_date,
UD.month_of_the_year,
UD.week_of_the_year,
UD.day_of_the_month,
UD.day_of_week,
UD.hour_of_day,
UD.nth_day,
UD.nth_hour,
UD.nth_week,
UD.nth_month,
UD.device_category,
UD.device_mobile_brand_name,
UD.device_mobile_model_name,
Expand Down Expand Up @@ -121,7 +116,6 @@ CREATE TEMP TABLE DataForTargetTable AS(
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down Expand Up @@ -163,7 +157,6 @@ CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.audience_segmentation_traini
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down Expand Up @@ -201,7 +194,6 @@ CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_audience_segmentation_train
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down Expand Up @@ -243,7 +235,6 @@ SELECT
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down Expand Up @@ -279,7 +270,6 @@ FROM (
user_id,
day_of_the_month,
day_of_week,
hour_of_day,
device_category,
device_mobile_model_name,
device_os_version,
Expand Down
Loading

0 comments on commit e2d16dc

Please sign in to comment.