Skip to content

Commit

Permalink
all compiled and deployed
Browse files Browse the repository at this point in the history
  • Loading branch information
Carlos Timoteo committed Dec 17, 2024
1 parent ca3a14c commit b31ea28
Show file tree
Hide file tree
Showing 8 changed files with 416 additions and 138 deletions.
12 changes: 10 additions & 2 deletions config/config.yaml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ vertex_ai:
pipeline_parameters_substitutions: # Substitutions are applied to the parameters before compilation
lead_score_propensity_label_procedure_name: "${project_id}.feature_store.invoke_lead_score_propensity_label"
user_dimensions_procedure_name: "${project_id}.feature_store.invoke_user_dimensions"
user_rolling_window_metrics_procedure_name: "${project_id}.feature_store.invoke_user_rolling_window_metrics"
user_rolling_window_metrics_procedure_name: "${project_id}.feature_store.invoke_user_rolling_window_lead_metrics"
user_scoped_metrics_procedure_name: "${project_id}.feature_store.invoke_user_scoped_metrics"
user_session_event_aggregated_metrics_procedure_name: "${project_id}.feature_store.invoke_user_session_event_aggregated_metrics"
date_timezone: "UTC" # used when input_date is None and need to get current date.
Expand Down Expand Up @@ -1728,7 +1728,7 @@ vertex_ai:
model_metric_threshold: 0.9
number_of_models_considered: 1
# This is the prediction dataset table or view.
bigquery_source: "${project_id}.lead_score_propensity.v_lead_score_propensity_inference_30_15"
bigquery_source: "${project_id}.lead_score_propensity.v_lead_score_propensity_inference_5_1"
bigquery_destination_prefix: "${project_id}.lead_score_propensity"
bq_unique_key: "user_pseudo_id"
machine_type: "n1-standard-4"
Expand Down Expand Up @@ -2668,6 +2668,14 @@ bigquery:
# The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events.
# This is usually the same value as the look forward window.
interval_end_date: 180
# This is a stored procedure that CALLs the User Rolling Window Lead Metrics stored procedure.
invoke_user_rolling_window_lead_metrics:
project_id: "${project_id}"
dataset: "feature_store"
stored_procedure: "user_rolling_window_lead_metrics"
# The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events.
# This is usually the same value as the look forward window.
interval_end_date: 15
# This is a stored procedure that CALLs the User Scoped Metrics stored procedure.
invoke_user_scoped_metrics:
project_id: "${project_id}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1637,7 +1637,7 @@ resource "null_resource" "create_gemini_model" {
# any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed.
lifecycle {
ignore_changes = all
#prevent_destroy = true
prevent_destroy = true
}

depends_on = [
Expand Down
1 change: 0 additions & 1 deletion sql/procedure/user_rolling_window_lead_metrics.sqlx
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ GROUP BY user_pseudo_id
);

-- Past User metrics: 1-day recipe_print per user, 2-5-day recipe_print per user
-- 2-5-day recipe_print per user
CREATE OR REPLACE TEMP TABLE rolling_recipe_print_past_days AS (
SELECT
-- User pseudo ID, a unique identifier for the user
Expand Down
376 changes: 242 additions & 134 deletions sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions sql/query/invoke_lead_score_propensity_inference_preparation.sqlx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Copyright 2023 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- This script determines the current date and then passes it as an argument to a
-- stored procedure in your BigQuery project. This pattern is commonly used when
-- you want a stored procedure to perform operations or calculations that are
-- relevant to the current date, such as data processing, analysis, or reporting tasks.

-- Inference date handed to the stored procedure: the current date at execution time.
DECLARE inference_date DATE DEFAULT CURRENT_DATE();

-- Run the templated stored procedure for that date.
CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(inference_date);
39 changes: 39 additions & 0 deletions sql/query/invoke_lead_score_propensity_label.sqlx
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Copyright 2023 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- This script sets up a date range, clamps it to the dates available in the events table,
-- and calls a stored procedure with this range and a variable to store a result.
-- This pattern is common for orchestrating data processing tasks within BigQuery using stored procedures.

-- Builds the [input_date, end_date] window for the label stored procedure, clamps it
-- against the dates present in the events table, and calls the procedure with it.
DECLARE input_date DATE;
DECLARE end_date DATE;
-- Passed as the third argument of the CALL below; presumably an output parameter of the
-- procedure (confirm against its definition). It is not read again in this script.
DECLARE users_added INT64 DEFAULT NULL;
-- Earliest and latest event_date in the events table, read once and reused below.
DECLARE first_event_date DATE;
DECLARE last_event_date DATE;

-- Initial window: ends today and starts {{interval_input_date}} days earlier.
SET end_date = CURRENT_DATE();
SET input_date = DATE_SUB(end_date, INTERVAL {{interval_input_date}} DAY);

-- Fetch both bounds in a single query instead of re-scanning the events table
-- in every condition and assignment.
SET (first_event_date, last_event_date) = (
  SELECT AS STRUCT
    MIN(event_date),
    MAX(event_date)
  FROM `{{mds_project_id}}.{{mds_dataset}}.event`
);

-- Keep end_date no later than one day after the latest available event date, so the
-- window does not extend beyond the data that actually exists.
-- If the table is empty the bound is NULL, the comparison is NULL and end_date is kept.
IF DATE_SUB(end_date, INTERVAL 1 DAY) > last_event_date THEN
  SET end_date = DATE_ADD(last_event_date, INTERVAL 1 DAY);
END IF;

-- If input_date falls before the earliest available event date, move it to one day
-- after that earliest date, so the window does not precede the available data.
IF input_date < first_event_date THEN
  SET input_date = DATE_ADD(first_event_date, INTERVAL 1 DAY);
END IF;

CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added);
73 changes: 73 additions & 0 deletions sql/query/invoke_lead_score_propensity_training_preparation.sqlx
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- Copyright 2023 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- This script determines the date range for training a lead score propensity
-- model by considering user-defined parameters and the availability of login
-- events within the dataset. It keeps the calculated date range when login events
-- exist within the specified bounds, and falls back to a fixed interval otherwise.

-- Intended start and end dates for training data
-- Initializing Training Dates
-- Start and end dates of the training window passed to the stored procedure.
DECLARE train_start_date DATE DEFAULT NULL;
DECLARE train_end_date DATE DEFAULT NULL;

-- Control data splitting for training and validation; forwarded unchanged to the stored procedure.
DECLARE train_split_end_number INT64 DEFAULT NULL;
DECLARE validation_split_end_number INT64 DEFAULT NULL;

-- Count of distinct users with a 'login' event between min_date and max_date.
DECLARE logged_users INT64 DEFAULT NULL;

-- Candidate bounds of the training window, derived from the event dates in the source data.
DECLARE max_date DATE;
DECLARE min_date DATE;

-- Earliest and latest event_date in the events table, read once and reused below.
DECLARE first_event_date DATE;
DECLARE last_event_date DATE;

-- Fetch both bounds in a single query instead of re-scanning the events table
-- in every assignment and condition.
SET (first_event_date, last_event_date) = (
  SELECT AS STRUCT
    MIN(event_date),
    MAX(event_date)
  FROM `{{mds_project_id}}.{{mds_dataset}}.event`
);

-- Candidate window: trim {{interval_max_date}} days from the end of the data and
-- {{interval_min_date}} days from the beginning.
SET max_date = DATE_SUB(last_event_date, INTERVAL {{interval_max_date}} DAY);
SET min_date = DATE_ADD(first_event_date, INTERVAL {{interval_min_date}} DAY);

-- If min_date >= latest event_date OR max_date <= earliest event_date OR min_date >= max_date,
-- the trimmed window is empty or inverted: fall back to the full range of event dates.
IF min_date >= last_event_date OR max_date <= first_event_date OR min_date >= max_date THEN
  SET min_date = first_event_date;
  SET max_date = last_event_date;
END IF;

-- Setting Split Numbers
-- Sets the train_split_end_number to a user-defined value. This value likely determines the proportion of data used for training.
SET train_split_end_number = {{train_split_end_number}};    -- If you want 60% for training use number 5. If you want 80% use number 7.
-- Sets the validation_split_end_number to a user-defined value, controlling the proportion of data used for validation.
SET validation_split_end_number = {{validation_split_end_number}};

-- Count distinct users who have an event named 'login' within the candidate window (bounds inclusive).
SET logged_users = (
  SELECT COUNT(DISTINCT user_pseudo_id)
  FROM `{{mds_project_id}}.{{mds_dataset}}.event`
  WHERE event_name = 'login'
    AND event_date BETWEEN min_date AND max_date
);

-- Setting Training Dates
-- If there are logged users in the candidate window, keep the calculated dates. Otherwise use a fixed
-- interval (3 years ago up to 5 days ago), which prevents `train_start_date` and `train_end_date`
-- from being NULL.
IF logged_users > 0 THEN
  SET train_start_date = min_date;
  SET train_end_date = max_date;
ELSE
  SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR);
  SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY);
END IF;

-- Finally, call the stored procedure with the training dates and split numbers. This stored procedure
-- handles the actual data preparation for the lead score propensity model.
CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(train_start_date, train_end_date, train_split_end_number, validation_split_end_number);
28 changes: 28 additions & 0 deletions sql/query/invoke_user_rolling_window_lead_metrics.sqlx
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Copyright 2023 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- This script sets up a date range, calls a stored procedure with this range and a variable to
-- store a result, and then returns the result of the stored procedure. This pattern is common
-- for orchestrating data processing tasks within BigQuery using stored procedures.

-- input_date is the current date at execution time.
DECLARE input_date DATE DEFAULT CURRENT_DATE();
-- end_date lies {{interval_end_date}} days before input_date.
DECLARE end_date DATE;
-- Passed as the third argument of the CALL and selected afterwards as the script result.
DECLARE users_added INT64 DEFAULT NULL;

SET end_date = DATE_SUB(input_date, INTERVAL {{interval_end_date}} DAY);

-- Run the templated stored procedure for the computed dates.
CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added);

-- Emit the value held in users_added after the call.
SELECT users_added;

0 comments on commit b31ea28

Please sign in to comment.