Skip to content

Commit

Permalink
optimizing scalability of training and inference preparation stored procedures
Browse files Browse the repository at this point in the history
  • Loading branch information
Carlos Timoteo committed Oct 11, 2023
1 parent 7c05ac6 commit cae3d7b
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 78 deletions.
155 changes: 104 additions & 51 deletions sql/procedure/audience_segmentation_inference_preparation.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ DECLARE lastest_processed_time_ud TIMESTAMP;
DECLARE lastest_processed_time_uwm TIMESTAMP;
DECLARE lastest_processed_time_um TIMESTAMP;

-- Shift `inference_date` back one day so the procedure looks back from the day before the supplied date
SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY);

SET lastest_processed_time_ud = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_segmentation_dimensions` WHERE feature_date = inference_date LIMIT 1);
SET lastest_processed_time_uwm = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` WHERE feature_date = inference_date LIMIT 1);
SET lastest_processed_time_um = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` WHERE feature_date = inference_date LIMIT 1);

SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY);

CREATE OR REPLACE TEMP TABLE inference_preparation_ud as (
SELECT DISTINCT
UD.user_pseudo_id,
Expand Down Expand Up @@ -62,42 +63,10 @@ CREATE OR REPLACE TEMP TABLE inference_preparation_ud as (
user_segmentation_dimensions_window AS (PARTITION BY UD.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);








CREATE TEMP TABLE inference_preparation AS (
CREATE TEMP TABLE inference_preparation_uwm AS (
SELECT DISTINCT
UD.user_pseudo_id,
MAX(UD.user_id) OVER(user_segmentation_dimensions_window) AS user_id,
UD.feature_date,
MAX(UD.month_of_the_year) OVER(user_segmentation_dimensions_window) AS month_of_the_year,
MAX(UD.week_of_the_year) OVER(user_segmentation_dimensions_window) AS week_of_the_year,
MAX(UD.day_of_the_month) OVER(user_segmentation_dimensions_window) AS day_of_the_month,
MAX(UD.day_of_week) OVER(user_segmentation_dimensions_window) AS day_of_week,
MAX(UD.device_category) OVER(user_segmentation_dimensions_window) AS device_category,
MAX(UD.device_mobile_brand_name) OVER(user_segmentation_dimensions_window) AS device_mobile_brand_name,
MAX(UD.device_mobile_model_name) OVER(user_segmentation_dimensions_window) AS device_mobile_model_name,
MAX(UD.device_os) OVER(user_segmentation_dimensions_window) AS device_os,
MAX(UD.device_os_version) OVER(user_segmentation_dimensions_window) AS device_os_version,
MAX(UD.device_language) OVER(user_segmentation_dimensions_window) AS device_language,
MAX(UD.device_web_browser) OVER(user_segmentation_dimensions_window) AS device_web_browser,
MAX(UD.device_web_browser_version) OVER(user_segmentation_dimensions_window) AS device_web_browser_version,
MAX(UD.geo_sub_continent) OVER(user_segmentation_dimensions_window) AS geo_sub_continent,
MAX(UD.geo_country) OVER(user_segmentation_dimensions_window) AS geo_country,
MAX(UD.geo_region) OVER(user_segmentation_dimensions_window) AS geo_region,
MAX(UD.geo_city) OVER(user_segmentation_dimensions_window) AS geo_city,
MAX(UD.geo_metro) OVER(user_segmentation_dimensions_window) AS geo_metro,
MAX(UD.last_traffic_source_medium) OVER(user_segmentation_dimensions_window) AS last_traffic_source_medium,
MAX(UD.last_traffic_source_name) OVER(user_segmentation_dimensions_window) AS last_traffic_source_name,
MAX(UD.last_traffic_source_source) OVER(user_segmentation_dimensions_window) AS last_traffic_source_source,
MAX(UD.first_traffic_source_medium) OVER(user_segmentation_dimensions_window) AS first_traffic_source_medium,
MAX(UD.first_traffic_source_name) OVER(user_segmentation_dimensions_window) AS first_traffic_source_name,
MAX(UD.first_traffic_source_source) OVER(user_segmentation_dimensions_window) AS first_traffic_source_source,
MAX(UD.has_signed_in_with_user_id) OVER(user_segmentation_dimensions_window) AS has_signed_in_with_user_id,
UWM.user_pseudo_id,
UWM.feature_date,
MAX(UWM.active_users_past_1_7_day) OVER(user_lookback_metrics_window) AS active_users_past_1_7_day,
MAX(UWM.active_users_past_8_14_day) OVER(user_lookback_metrics_window) AS active_users_past_8_14_day,
MAX(UWM.purchases_past_1_7_day) OVER(user_lookback_metrics_window) AS purchases_past_1_7_day,
Expand All @@ -111,7 +80,22 @@ CREATE TEMP TABLE inference_preparation AS (
MAX(UWM.checkouts_past_1_7_day) OVER(user_lookback_metrics_window) AS checkouts_past_1_7_day,
MAX(UWM.checkouts_past_8_14_day) OVER(user_lookback_metrics_window) AS checkouts_past_8_14_day,
MAX(UWM.ltv_revenue_past_1_7_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_1_7_day,
MAX(UWM.ltv_revenue_past_7_15_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_7_15_day,
MAX(UWM.ltv_revenue_past_7_15_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_7_15_day
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` UWM
WHERE
-- Keep only rows for the inference date that carry the latest processed timestamp
UWM.feature_date = inference_date
AND UWM.processed_timestamp = lastest_processed_time_uwm
WINDOW
user_lookback_metrics_window AS (PARTITION BY UWM.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);



CREATE TEMP TABLE inference_preparation_um AS (
SELECT DISTINCT
UM.feature_date,
MAX(UM.purchasers_users) OVER(user_scoped_segmentation_metrics_window) AS purchasers_users,
MAX(UM.average_daily_purchasers) OVER(user_scoped_segmentation_metrics_window) AS average_daily_purchasers,
MAX(UM.active_users) OVER(user_scoped_segmentation_metrics_window) AS active_users,
Expand All @@ -138,26 +122,95 @@ CREATE TEMP TABLE inference_preparation AS (
MAX(UM.avg_user_conversion_rate) OVER(user_scoped_segmentation_metrics_window) AS avg_user_conversion_rate,
MAX(UM.avg_session_conversion_rate) OVER(user_scoped_segmentation_metrics_window) AS avg_session_conversion_rate
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_segmentation_dimensions` UD
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` UM
WHERE
-- Keep only rows for the inference date that carry the latest processed timestamp
UM.feature_date = inference_date
AND UM.processed_timestamp = lastest_processed_time_um
WINDOW
user_scoped_segmentation_metrics_window AS (PARTITION BY UM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);


CREATE TEMP TABLE inference_preparation AS (
SELECT DISTINCT
UD.user_pseudo_id,
UD.user_id,
UD.feature_date,
UD.month_of_the_year,
UD.week_of_the_year,
UD.day_of_the_month,
UD.day_of_week,
UD.device_category,
UD.device_mobile_brand_name,
UD.device_mobile_model_name,
UD.device_os,
UD.device_os_version,
UD.device_language,
UD.device_web_browser,
UD.device_web_browser_version,
UD.geo_sub_continent,
UD.geo_country,
UD.geo_region,
UD.geo_city,
UD.geo_metro,
UD.last_traffic_source_medium,
UD.last_traffic_source_name,
UD.last_traffic_source_source,
UD.first_traffic_source_medium,
UD.first_traffic_source_name,
UD.first_traffic_source_source,
UD.has_signed_in_with_user_id,
UWM.active_users_past_1_7_day,
UWM.active_users_past_8_14_day,
UWM.purchases_past_1_7_day,
UWM.purchases_past_8_14_day,
UWM.visits_past_1_7_day,
UWM.visits_past_8_14_day,
UWM.view_items_past_1_7_day,
UWM.view_items_past_8_14_day,
UWM.add_to_carts_past_1_7_day,
UWM.add_to_carts_past_8_14_day,
UWM.checkouts_past_1_7_day,
UWM.checkouts_past_8_14_day,
UWM.ltv_revenue_past_1_7_day,
UWM.ltv_revenue_past_7_15_day,
UM.purchasers_users,
UM.average_daily_purchasers,
UM.active_users,
UM.DAU,
UM.MAU,
UM.WAU,
UM.dau_per_mau,
UM.dau_per_wau,
UM.wau_per_mau,
UM.users_engagement_duration_seconds,
UM.average_engagement_time,
UM.average_engagement_time_per_session,
UM.average_sessions_per_user,
UM.ARPPU,
UM.ARPU,
UM.average_daily_revenue,
UM.max_daily_revenue,
UM.min_daily_revenue,
UM.new_users,
UM.returning_users,
UM.first_time_purchasers,
UM.first_time_purchaser_conversion,
UM.first_time_purchasers_per_new_user,
UM.avg_user_conversion_rate,
UM.avg_session_conversion_rate
FROM
inference_preparation_ud UD
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` UWM
inference_preparation_uwm UWM
ON
UWM.user_pseudo_id = UD.user_pseudo_id
AND UWM.feature_date = UD.feature_date
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` UM
inference_preparation_um UM
ON
UM.feature_date = UD.feature_date
WHERE
-- Keep only rows for the inference date that carry the latest processed timestamp
UD.feature_date = inference_date
AND UD.processed_timestamp = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_segmentation_dimensions` WHERE feature_date = inference_date LIMIT 1)
AND UWM.processed_timestamp = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` WHERE feature_date = inference_date LIMIT 1)
AND UM.processed_timestamp = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` WHERE feature_date = inference_date LIMIT 1)
WINDOW
user_segmentation_dimensions_window AS (PARTITION BY UD.user_pseudo_id, UD.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
user_lookback_metrics_window AS (PARTITION BY UWM.user_pseudo_id, UWM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
user_scoped_segmentation_metrics_window AS (PARTITION BY UM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);

DELETE FROM `{{project_id}}.{{dataset}}.{{insert_table}}` WHERE TRUE;
Expand Down
125 changes: 111 additions & 14 deletions sql/procedure/audience_segmentation_training_preparation.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ DECLARE min_date DATE;
SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL 1 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);
SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL 15 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`);

CREATE TEMP TABLE training_preparation as (
CREATE TEMP TABLE training_preparation_ud as (
SELECT DISTINCT
UD.user_pseudo_id,
UD.user_pseudo_id,
MAX(UD.user_id) OVER(user_segmentation_dimensions_window) AS user_id,
UD.feature_date,
MAX(UD.month_of_the_year) OVER(user_segmentation_dimensions_window) AS month_of_the_year,
Expand All @@ -45,7 +45,21 @@ CREATE TEMP TABLE training_preparation as (
MAX(UD.first_traffic_source_medium) OVER(user_segmentation_dimensions_window) AS first_traffic_source_medium,
MAX(UD.first_traffic_source_name) OVER(user_segmentation_dimensions_window) AS first_traffic_source_name,
MAX(UD.first_traffic_source_source) OVER(user_segmentation_dimensions_window) AS first_traffic_source_source,
MAX(UD.has_signed_in_with_user_id) OVER(user_segmentation_dimensions_window) AS has_signed_in_with_user_id,
MAX(UD.has_signed_in_with_user_id) OVER(user_segmentation_dimensions_window) AS has_signed_in_with_user_id
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_segmentation_dimensions` UD
WHERE
-- Define the training+validation subset interval
UD.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
WINDOW
user_segmentation_dimensions_window AS (PARTITION BY UD.user_pseudo_id, UD.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);


CREATE TEMP TABLE training_preparation_uwm as (
SELECT DISTINCT
UWM.user_pseudo_id,
UWM.feature_date,
MAX(UWM.active_users_past_1_7_day) OVER(user_lookback_metrics_window) AS active_users_past_1_7_day,
MAX(UWM.active_users_past_8_14_day) OVER(user_lookback_metrics_window) AS active_users_past_8_14_day,
MAX(UWM.purchases_past_1_7_day) OVER(user_lookback_metrics_window) AS purchases_past_1_7_day,
Expand All @@ -59,7 +73,19 @@ CREATE TEMP TABLE training_preparation as (
MAX(UWM.checkouts_past_1_7_day) OVER(user_lookback_metrics_window) AS checkouts_past_1_7_day,
MAX(UWM.checkouts_past_8_14_day) OVER(user_lookback_metrics_window) AS checkouts_past_8_14_day,
MAX(UWM.ltv_revenue_past_1_7_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_1_7_day,
MAX(UWM.ltv_revenue_past_7_15_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_7_15_day,
MAX(UWM.ltv_revenue_past_7_15_day) OVER(user_lookback_metrics_window) AS ltv_revenue_past_7_15_day
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` UWM
WHERE
-- Define the training+validation subset interval
UWM.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
WINDOW
user_lookback_metrics_window AS (PARTITION BY UWM.user_pseudo_id, UWM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);

CREATE TEMP TABLE training_preparation_um as (
SELECT DISTINCT
UM.feature_date,
MAX(UM.purchasers_users) OVER(user_scoped_segmentation_metrics_window) AS purchasers_users,
MAX(UM.average_daily_purchasers) OVER(user_scoped_segmentation_metrics_window) AS average_daily_purchasers,
MAX(UM.active_users) OVER(user_scoped_segmentation_metrics_window) AS active_users,
Expand All @@ -86,23 +112,94 @@ CREATE TEMP TABLE training_preparation as (
MAX(UM.avg_user_conversion_rate) OVER(user_scoped_segmentation_metrics_window) AS avg_user_conversion_rate,
MAX(UM.avg_session_conversion_rate) OVER(user_scoped_segmentation_metrics_window) AS avg_session_conversion_rate
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_segmentation_dimensions` UD
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` UM
WHERE
-- Define the training+validation subset interval
UM.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
WINDOW
user_scoped_segmentation_metrics_window AS (PARTITION BY UM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);


CREATE TEMP TABLE training_preparation as (
SELECT DISTINCT
UD.user_pseudo_id,
UD.user_id,
UD.feature_date,
UD.month_of_the_year,
UD.week_of_the_year,
UD.day_of_the_month,
UD.day_of_week,
UD.device_category,
UD.device_mobile_brand_name,
UD.device_mobile_model_name,
UD.device_os,
UD.device_os_version,
UD.device_language,
UD.device_web_browser,
UD.device_web_browser_version,
UD.geo_sub_continent,
UD.geo_country,
UD.geo_region,
UD.geo_city,
UD.geo_metro,
UD.last_traffic_source_medium,
UD.last_traffic_source_name,
UD.last_traffic_source_source,
UD.first_traffic_source_medium,
UD.first_traffic_source_name,
UD.first_traffic_source_source,
UD.has_signed_in_with_user_id,
UWM.active_users_past_1_7_day,
UWM.active_users_past_8_14_day,
UWM.purchases_past_1_7_day,
UWM.purchases_past_8_14_day,
UWM.visits_past_1_7_day,
UWM.visits_past_8_14_day,
UWM.view_items_past_1_7_day,
UWM.view_items_past_8_14_day,
UWM.add_to_carts_past_1_7_day,
UWM.add_to_carts_past_8_14_day,
UWM.checkouts_past_1_7_day,
UWM.checkouts_past_8_14_day,
UWM.ltv_revenue_past_1_7_day,
UWM.ltv_revenue_past_7_15_day,
UM.purchasers_users,
UM.average_daily_purchasers,
UM.active_users,
UM.DAU,
UM.MAU,
UM.WAU,
UM.dau_per_mau,
UM.dau_per_wau,
UM.wau_per_mau,
UM.users_engagement_duration_seconds,
UM.average_engagement_time,
UM.average_engagement_time_per_session,
UM.average_sessions_per_user,
UM.ARPPU,
UM.ARPU,
UM.average_daily_revenue,
UM.max_daily_revenue,
UM.min_daily_revenue,
UM.new_users,
UM.returning_users,
UM.first_time_purchasers,
UM.first_time_purchaser_conversion,
UM.first_time_purchasers_per_new_user,
UM.avg_user_conversion_rate,
UM.avg_session_conversion_rate
FROM
training_preparation_ud UD
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_lookback_metrics` UWM
training_preparation_uwm UWM
ON
UWM.user_pseudo_id = UD.user_pseudo_id
AND UWM.feature_date = UD.feature_date
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_segmentation_metrics` UM
training_preparation_um UM
ON
UM.feature_date = UD.feature_date
WHERE
-- Define the training+validation subset interval
UD.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
WINDOW
user_segmentation_dimensions_window AS (PARTITION BY UD.user_pseudo_id, UD.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
user_lookback_metrics_window AS (PARTITION BY UWM.user_pseudo_id, UWM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
user_scoped_segmentation_metrics_window AS (PARTITION BY UM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);


Expand Down
12 changes: 6 additions & 6 deletions sql/procedure/customer_lifetime_value_inference_preparation.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ DECLARE lastest_processed_time_ud TIMESTAMP;
DECLARE lastest_processed_time_uwm TIMESTAMP;
DECLARE lastest_processed_time_um TIMESTAMP;

-- Shift `inference_date` back one day so the procedure looks back from the day before the supplied date
SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY);

SET lastest_processed_time_ud = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_lifetime_dimensions` WHERE feature_date = inference_date LIMIT 1);
SET lastest_processed_time_uwm = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lifetime_metrics` WHERE feature_date = inference_date LIMIT 1);
SET lastest_processed_time_um = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_lifetime_metrics` WHERE feature_date = inference_date LIMIT 1);

-- Shift `inference_date` back one day so the procedure looks back from the day before the supplied date
SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY);

CREATE OR REPLACE TEMP TABLE inference_preparation_ud as (
SELECT DISTINCT
UD.user_pseudo_id,
Expand Down Expand Up @@ -248,14 +248,14 @@ CREATE OR REPLACE TEMP TABLE inference_preparation as (
UM.lifetime_avg_user_conversion_rate,
UM.lifetime_avg_session_conversion_rate
FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_lifetime_dimensions` UD
inference_preparation_ud UD
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lifetime_metrics` UWM
inference_preparation_uwm UWM
ON
UWM.user_pseudo_id = UD.user_pseudo_id
AND UWM.feature_date = UD.feature_date
INNER JOIN
`{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_lifetime_metrics` UM
inference_preparation_um UM
ON
UM.feature_date = UD.feature_date
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ FROM
`{{feature_store_project_id}}.{{feature_store_dataset}}.customer_lifetime_value_label` LABEL
WHERE
-- Define the training subset interval
UD.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
LABEL.feature_date BETWEEN GREATEST(start_date, min_date) AND LEAST(end_date, max_date)
WINDOW
customer_lifetime_value_window AS (PARTITION BY LABEL.user_pseudo_id, LABEL.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
);
Expand Down
Loading

0 comments on commit cae3d7b

Please sign in to comment.