From 92b585a3c7bd718f7f622c79db23ed5ee5181d00 Mon Sep 17 00:00:00 2001 From: Charlie Wang <2144018+kingman@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:16:41 +0100 Subject: [PATCH] specify the columns in the backfill procedures, make sure the scripts are executable with a previously created feature table (#254) --- ...nvoke_backfill_churn_propensity_label.sqlx | 8 ++- ...ackfill_customer_lifetime_value_label.sqlx | 9 ++- ...ke_backfill_purchase_propensity_label.sqlx | 21 ++++++- .../invoke_backfill_user_dimensions.sqlx | 26 +++++++- ...oke_backfill_user_lifetime_dimensions.sqlx | 26 +++++++- ...invoke_backfill_user_lookback_metrics.sqlx | 20 ++++++- ..._user_rolling_window_lifetime_metrics.sqlx | 45 +++++++++++++- ..._backfill_user_rolling_window_metrics.sqlx | 60 ++++++++++++++++++- ...backfill_user_scoped_lifetime_metrics.sqlx | 30 +++++++++- .../invoke_backfill_user_scoped_metrics.sqlx | 30 +++++++++- ...fill_user_scoped_segmentation_metrics.sqlx | 30 +++++++++- ...backfill_user_segmentation_dimensions.sqlx | 26 +++++++- ...user_session_event_aggregated_metrics.sqlx | 40 ++++++++++++- 13 files changed, 358 insertions(+), 13 deletions(-) diff --git a/sql/query/invoke_backfill_churn_propensity_label.sqlx b/sql/query/invoke_backfill_churn_propensity_label.sqlx index 4cbe77ac..9dd41da7 100644 --- a/sql/query/invoke_backfill_churn_propensity_label.sqlx +++ b/sql/query/invoke_backfill_churn_propensity_label.sqlx @@ -119,7 +119,13 @@ GROUP BY ); -- Insert data into the target table, combining user information with churn and bounce status -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + churned, + bounced +) SELECT DISTINCT -- Current timestamp as the processing timestamp CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx 
b/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx index 27ea59d0..569e5db5 100644 --- a/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx +++ b/sql/query/invoke_backfill_customer_lifetime_value_label.sqlx @@ -109,7 +109,14 @@ CREATE OR REPLACE TEMP TABLE future_revenue_per_user AS ( ); -- Insert data into the target table -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + pltv_revenue_30_days, + pltv_revenue_90_days, + pltv_revenue_180_days +) SELECT DISTINCT -- Current timestamp of the processing CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_purchase_propensity_label.sqlx b/sql/query/invoke_backfill_purchase_propensity_label.sqlx index a2c8bee0..b062dc58 100644 --- a/sql/query/invoke_backfill_purchase_propensity_label.sqlx +++ b/sql/query/invoke_backfill_purchase_propensity_label.sqlx @@ -125,7 +125,26 @@ CREATE OR REPLACE TEMP TABLE future_purchases_per_user AS ( ); -- Inserts data into the target table -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + purchase_day_1, + purchase_day_2, + purchase_day_3, + purchase_day_4, + purchase_day_5, + purchase_day_6, + purchase_day_7, + purchase_day_8, + purchase_day_9, + purchase_day_10, + purchase_day_11, + purchase_day_12, + purchase_day_13, + purchase_day_14, + purchase_day_15_30 +) SELECT DISTINCT -- Selects the current timestamp and assigns it to the column processed_timestamp CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_dimensions.sqlx b/sql/query/invoke_backfill_user_dimensions.sqlx index c27dd299..6c81b412 100644 --- a/sql/query/invoke_backfill_user_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_dimensions.sqlx @@ -122,7 +122,31 @@ CREATE OR REPLACE TEMP 
TABLE events_users as ( ; -- Inserting aggregated user data into the target table. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) SELECT DISTINCT -- Timestamp of the data processing CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx index 4001878f..b05611e0 100644 --- a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx @@ -137,7 +137,31 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- This code block inserts data into the specified table, combining information from the "events_users" table -- and the "user_dimensions_event_session_scoped" table. -- It aggregates user-level features for each user and date. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) SELECT DISTINCT -- The current timestamp. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_lookback_metrics.sqlx b/sql/query/invoke_backfill_user_lookback_metrics.sqlx index 25e3566b..37bd4563 100644 --- a/sql/query/invoke_backfill_user_lookback_metrics.sqlx +++ b/sql/query/invoke_backfill_user_lookback_metrics.sqlx @@ -230,7 +230,25 @@ AND D.device_os IS NOT NULL -- This code is part of a larger process for building a machine learning model that predicts -- user behavior based on their past activity. The features generated by this code can be used -- as input to the model, helping it learn patterns and make predictions. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_7_day, + active_users_past_8_14_day, + purchases_past_1_7_day, + purchases_past_8_14_day, + visits_past_1_7_day, + visits_past_8_14_day, + view_items_past_1_7_day, + view_items_past_8_14_day, + add_to_carts_past_1_7_day, + add_to_carts_past_8_14_day, + checkouts_past_1_7_day, + checkouts_past_8_14_day, + ltv_revenue_past_1_7_day, + ltv_revenue_past_7_15_day +) SELECT DISTINCT -- Timestamp indicating when the data was processed CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx index 2ee219f1..b4a0a415 100644 --- a/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx +++ b/sql/query/invoke_backfill_user_rolling_window_lifetime_metrics.sqlx @@ -283,7 +283,50 @@ AND D.device_os IS NOT NULL -- This code is part of a larger process for building a machine learning model that predicts -- user behavior based on their past activity. The features generated by this code can be used -- as input to the model, helping it learn patterns and make predictions. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_30_day, + active_users_past_30_60_day, + active_users_past_60_90_day, + active_users_past_90_120_day, + active_users_past_120_150_day, + active_users_past_150_180_day, + purchases_past_1_30_day, + purchases_past_30_60_day, + purchases_past_60_90_day, + purchases_past_90_120_day, + purchases_past_120_150_day, + purchases_past_150_180_day, + visits_past_1_30_day, + visits_past_30_60_day, + visits_past_60_90_day, + visits_past_90_120_day, + visits_past_120_150_day, + visits_past_150_180_day, + view_items_past_1_30_day, + view_items_past_30_60_day, + view_items_past_60_90_day, + view_items_past_90_120_day, + view_items_past_120_150_day, + view_items_past_150_180_day, + add_to_carts_past_1_30_day, + add_to_carts_past_30_60_day, + add_to_carts_past_60_90_day, + add_to_carts_past_90_120_day, + add_to_carts_past_120_150_day, + add_to_carts_past_150_180_day, + checkouts_past_1_30_day, + checkouts_past_30_60_day, + checkouts_past_60_90_day, + checkouts_past_90_120_day, + checkouts_past_120_150_day, + checkouts_past_150_180_day, + ltv_revenue_past_1_30_day, + ltv_revenue_past_30_90_day, + ltv_revenue_past_90_180_day +) SELECT DISTINCT -- This selects the current timestamp and assigns it to the column processed_timestamp. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx index 9317225a..be0a0860 100644 --- a/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx +++ b/sql/query/invoke_backfill_user_rolling_window_metrics.sqlx @@ -272,7 +272,65 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- table and several temporary tables containing rolling window features. 
The resulting data -- represents user-level features for each user and date, capturing their past activity within -- different time windows. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + active_users_past_1_day, + active_users_past_2_day, + active_users_past_3_day, + active_users_past_4_day, + active_users_past_5_day, + active_users_past_6_day, + active_users_past_7_day, + active_users_past_8_14_day, + active_users_past_15_30_day, + purchases_past_1_day, + purchases_past_2_day, + purchases_past_3_day, + purchases_past_4_day, + purchases_past_5_day, + purchases_past_6_day, + purchases_past_7_day, + purchases_past_8_14_day, + purchases_past_15_30_day, + visits_past_1_day, + visits_past_2_day, + visits_past_3_day, + visits_past_4_day, + visits_past_5_day, + visits_past_6_day, + visits_past_7_day, + visits_past_8_14_day, + visits_past_15_30_day, + view_items_past_1_day, + view_items_past_2_day, + view_items_past_3_day, + view_items_past_4_day, + view_items_past_5_day, + view_items_past_6_day, + view_items_past_7_day, + view_items_past_8_14_day, + view_items_past_15_30_day, + add_to_carts_past_1_day, + add_to_carts_past_2_day, + add_to_carts_past_3_day, + add_to_carts_past_4_day, + add_to_carts_past_5_day, + add_to_carts_past_6_day, + add_to_carts_past_7_day, + add_to_carts_past_8_14_day, + add_to_carts_past_15_30_day, + checkouts_past_1_day, + checkouts_past_2_day, + checkouts_past_3_day, + checkouts_past_4_day, + checkouts_past_5_day, + checkouts_past_6_day, + checkouts_past_7_day, + checkouts_past_8_14_day, + checkouts_past_15_30_day +) SELECT DISTINCT -- This selects the current timestamp and assigns it to the column processed_timestamp. 
CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx index bfb93869..ed4bf30e 100644 --- a/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_lifetime_metrics.sqlx @@ -163,7 +163,35 @@ CREATE OR REPLACE TEMP TABLE first_purchasers as ( ); -- This SQL code calculates various user engagement and revenue metrics at a daily level and inserts the results into a target table. It leverages several temporary tables created earlier in the script to aggregate data efficiently. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + lifetime_purchasers_users, + lifetime_average_daily_purchasers, + lifetime_active_users, + lifetime_DAU, + lifetime_MAU, + lifetime_WAU, + lifetime_dau_per_mau, + lifetime_dau_per_wau, + lifetime_wau_per_mau, + lifetime_users_engagement_duration_seconds, + lifetime_average_engagement_time, + lifetime_average_engagement_time_per_session, + lifetime_average_sessions_per_user, + lifetime_ARPPU, + lifetime_ARPU, + lifetime_average_daily_revenue, + lifetime_max_daily_revenue, + lifetime_min_daily_revenue, + lifetime_new_users, + lifetime_returning_users, + lifetime_first_time_purchasers, + lifetime_first_time_purchaser_conversion, + lifetime_first_time_purchasers_per_new_user, + lifetime_avg_user_conversion_rate, + lifetime_avg_session_conversion_rate +) SELECT -- Records the current timestamp when the query is executed. 
CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_metrics.sqlx index 3cc45b49..c5252519 100644 --- a/sql/query/invoke_backfill_user_scoped_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_metrics.sqlx @@ -183,7 +183,35 @@ CREATE OR REPLACE TEMP TABLE new_users_ as ( ); -- Insert data into the target table after calculating various user engagement and revenue metrics. -INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate +) SELECT DISTINCT -- Record the current timestamp when the query is executed. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx b/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx index c6f03aaa..251dfead 100644 --- a/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx +++ b/sql/query/invoke_backfill_user_scoped_segmentation_metrics.sqlx @@ -136,7 +136,35 @@ GROUP BY feature_date ); -- This SQL code calculates various user engagement and revenue metrics at a daily level and inserts the results into a target table. It leverages several temporary tables created earlier in the script to aggregate data efficiently. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate +) SELECT -- Records the current timestamp when the query is executed. CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx b/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx index be402415..cf2dc7ff 100644 --- a/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_segmentation_dimensions.sqlx @@ -95,7 +95,31 @@ CREATE OR REPLACE TEMP TABLE events_users as ( -- This code snippet performs a complex aggregation and insertion operation. It combines data from two temporary tables, -- calculates various user-level dimensions, and inserts the aggregated results into a target table. The use of window functions, -- approximate aggregation, and careful joining ensures that the query is efficient and produces meaningful insights from the data. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id +) -- The DISTINCT keyword ensures that only unique rows are inserted, eliminating any potential duplicates. SELECT DISTINCT CURRENT_TIMESTAMP() AS processed_timestamp, diff --git a/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx b/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx index 7ba0e2f7..4c6f3373 100644 --- a/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx +++ b/sql/query/invoke_backfill_user_session_event_aggregated_metrics.sqlx @@ -354,7 +354,45 @@ CREATE OR REPLACE TEMP TABLE events_users_days as ( -- user_events_per_day_event_scoped (UEPDES): Contains user-level event metrics aggregated on a daily basis. Metrics include add_to_carts, cart_to_view_rate, checkouts, ecommerce_purchases, etc. -- repeated_purchase (R): Stores information about whether a user has made previous purchases, indicated by the how_many_purchased_before column. -- cart_to_purchase (CP): Contains a flag (has_abandoned_cart) indicating whether a user abandoned their cart on a given day. 
-INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + engagement_rate, + engaged_sessions_per_user, + session_conversion_rate, + bounces, + bounce_rate_per_user, + sessions_per_user, + avg_views_per_session, + sum_engagement_time_seconds, + avg_engagement_time_seconds, + new_visits, + returning_visits, + add_to_carts, + cart_to_view_rate, + checkouts, + ecommerce_purchases, + ecommerce_quantity, + ecommerce_revenue, + item_revenue, + item_quantity, + item_refund_amount, + item_view_events, + items_clicked_in_promotion, + items_clicked_in_list, + items_checked_out, + items_added_to_cart, + item_list_click_events, + item_list_view_events, + purchase_revenue, + purchase_to_view_rate, + refunds, + transactions_per_purchaser, + user_conversion_rate, + how_many_purchased_before, + has_abandoned_cart +) SELECT CURRENT_TIMESTAMP() AS processed_timestamp, EUD.feature_date,