diff --git a/config/config.yaml.tftpl b/config/config.yaml.tftpl index 4b4d9914..b9d087b9 100644 --- a/config/config.yaml.tftpl +++ b/config/config.yaml.tftpl @@ -195,7 +195,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-auto-audience-segmentation.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-auto-audience-segmentation.execution.schedule.state} # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -277,7 +277,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-audience-segmentation.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-audience-segmentation.execution.schedule.state} # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -340,7 +340,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-purchase-propensity.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-purchase-propensity.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -401,7 +401,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-churn-propensity.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-churn-propensity.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -456,7 +456,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-customer-ltv.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-customer-ltv.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -517,7 +517,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.feature-creation-aggregated-value-based-bidding.execution.schedule.state}' + state: ${pipeline_configuration.feature-creation-aggregated-value-based-bidding.execution.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -531,6 +531,67 @@ vertex_ai: pipeline_parameters_substitutions: aggregated_value_based_bidding_training_preparation_procedure_name: "${project_id}.aggregated_vbb.invoke_aggregated_value_based_bidding_training_preparation" aggregated_value_based_bidding_explanation_preparation_procedure_name: 
"${project_id}.aggregated_vbb.invoke_aggregated_value_based_bidding_explanation_preparation" + + # This pipeline contains the configuration parameters for the feature creation pipeline for the lead score propensity model. + # To deploy this pipeline to your Google Cloud project: + ## 1. Define the pipeline parameters below, following YAML format + ## 2. Define the queries and procedures SQL parameters in this file under the `bigquery` section, following YAML format + ## 3. Create the queries and procedures SQL files under sql/ folder + ## 4. Create the terraform resources uin terraform/feature-store/bigquery-procedures.tf + ## 5. Create the terraform resources to compile and schedule the pipeline in terraform/pipelines/pipelines.tf + ## 6. Define python function that perform `compilation` and `upload to GCS bucket` are defined in `python/pipelines/compiler.py` and `python/pipelines/uploader.py`. + ## 7. Define python function that perform `schedule` of the pipeline is defined in `python/pipelines/scheduler.py`. + ## 8. Create the pipeline python function in python/pipelines/feature_engineering_pipelines.py + ## 9. Run terraform apply + feature-creation-lead-score-propensity: + execution: + name: "feature-creation-lead-score-propensity" + job_id_prefix: "feature-creation-lead-score-propensity-" + experiment_name: "feature-creation-lead-score-propensity" + # `type` must be "custom", when we're building Python and/or SQL based pipelines for feature engineering purposes. + type: "custom" + schedule: + cron: "TZ=${time_zone} 0 1 * * *" + # Define the maximum number of concurrent pipeline runs. + # The default value is 1. + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.feature-creation-lead-score-propensity.execution.schedule.state} + pipeline_parameters: + project_id: "${project_id}" + location: "${location}" + # The query_lead_score_propensity_label defines the procedure that will be used to invoke the creation of the lead score propensity label feature table. + query_lead_score_propensity_label: " + CALL `{lead_score_propensity_label_procedure_name}`();" + # The query_user_dimensions defines the procedure that will be used to invoke the creation of the user dimensions feature table. + query_user_dimensions: " + CALL `{user_dimensions_procedure_name}`();" + # The query_user_rolling_window_metrics defines the procedure that will be used to invoke the creation of the user rolling window metrics feature table. + query_user_rolling_window_metrics: " + CALL `{user_rolling_window_metrics_procedure_name}`();" + # The query_lead_score_propensity_inference_preparation define the procedure that will be used to invoke the creation of the lead score propensity inference preparation table. + query_lead_score_propensity_inference_preparation: " + CALL `{lead_score_propensity_inference_preparation_procedure_name}`();" + # The query_lead_score_propensity_training_preparation define the procedure that will be used to invoke the creation of the lead score propensity training preparation table. 
+ query_lead_score_propensity_training_preparation: " + CALL `{lead_score_propensity_training_preparation_procedure_name}`();" + timeout: 3600.0 + pipeline_parameters_substitutions: # Substitutions are applied to the parameters before compilation + lead_score_propensity_label_procedure_name: "${project_id}.feature_store.invoke_lead_score_propensity_label" + user_dimensions_procedure_name: "${project_id}.feature_store.invoke_user_dimensions" + user_rolling_window_metrics_procedure_name: "${project_id}.feature_store.invoke_user_rolling_window_lead_metrics" + user_scoped_metrics_procedure_name: "${project_id}.feature_store.invoke_user_scoped_metrics" + user_session_event_aggregated_metrics_procedure_name: "${project_id}.feature_store.invoke_user_session_event_aggregated_metrics" + date_timezone: "UTC" # used when input_date is None and need to get current date. + lead_score_propensity_inference_preparation_procedure_name: "${project_id}.lead_score_propensity.invoke_lead_score_propensity_inference_preparation" + lead_score_propensity_training_preparation_procedure_name: "${project_id}.lead_score_propensity.invoke_lead_score_propensity_training_preparation" # This pipeline contains the configuration parameters for the value based bidding training and inference pipelines. # To deploy this pipeline to your Google Cloud project: @@ -563,7 +624,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.value_based_bidding.training.schedule.state}' + state: ${pipeline_configuration.value_based_bidding.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -601,10 +662,15 @@ vertex_ai: stats_and_example_gen_dataflow_max_num_workers: 10 evaluation_dataflow_starting_num_workers: 5 evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -644,7 +710,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.value_based_bidding.explanation.schedule.state}' + state: ${pipeline_configuration.value_based_bidding.explanation.schedule.state} pipeline_parameters: project: "${project_id}" location: "${cloud_region}" @@ -689,7 +755,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.purchase_propensity.training.schedule.state}' + state: ${pipeline_configuration.purchase_propensity.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: 
project: "${project_id}" @@ -735,10 +801,15 @@ vertex_ai: stats_and_example_gen_dataflow_max_num_workers: 10 evaluation_dataflow_starting_num_workers: 5 evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -790,7 +861,7 @@ vertex_ai: subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true use_private_service_access: false - state: '${pipeline_configuration.purchase_propensity.prediction.schedule.state}' + state: ${pipeline_configuration.purchase_propensity.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${cloud_region}" @@ -850,10 +921,10 @@ vertex_ai: # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering subnetwork: "default" # If you want to use the vpc network defined above, set the following flag to true - use_private_service_access: false + use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: '${pipeline_configuration.churn_propensity.training.schedule.state}' + state: ${pipeline_configuration.churn_propensity.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -899,10 +970,17 @@ vertex_ai: stats_and_example_gen_dataflow_max_num_workers: 10 evaluation_dataflow_starting_num_workers: 5 evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -956,7 +1034,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. 
- state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.churn_propensity.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${cloud_region}" @@ -1019,7 +1097,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.segmentation.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be compiled # For Demographics Audience Segmentation model, we use the BQML KMeans clustering algorithm. # Check the official documentation for better understanding the algorithm @@ -1070,7 +1148,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.segmentation.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -1124,7 +1202,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.auto_segmentation.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be compiled # For Interest based Auto Audience Segmentation model, we use the BQML KMeans clustering algorithm. # Check the official documentation for better understanding the algorithm @@ -1174,7 +1252,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.auto_segmentation.prediction.schedule.state} pipeline_parameters: project_id: "${project_id}" location: "${location}" @@ -1231,7 +1309,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. 
- state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.propensity_clv.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -1276,10 +1354,15 @@ vertex_ai: stats_and_example_gen_dataflow_max_num_workers: 10 evaluation_dataflow_starting_num_workers: 5 evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -1350,7 +1433,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.clv.training.schedule.state} # These are pipeline parameters that will be passed to the pipeline to be recompiled pipeline_parameters: project: "${project_id}" @@ -1392,10 +1475,15 @@ vertex_ai: stats_and_example_gen_dataflow_max_num_workers: 10 evaluation_dataflow_starting_num_workers: 5 evaluation_dataflow_max_num_workers: 10 - distill_batch_predict_max_replica_count: 10 - distill_batch_predict_starting_replica_count: 10 - evaluation_batch_predict_max_replica_count: 10 - evaluation_batch_predict_starting_replica_count: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 evaluation_dataflow_disk_size_gb: 30 stats_and_example_gen_dataflow_disk_size_gb: 30 transform_dataflow_disk_size_gb: 30 @@ -1446,7 +1534,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ${pipeline_configuration.clv.prediction.schedule.state} # These are the pipeline parameters to be used in this convoluted prediction pipeline that takes predictions from LTV model and purchase propensity model. pipeline_parameters: project_id: "${project_id}" @@ -1491,6 +1579,173 @@ vertex_ai: pubsub_activation_type: "cltv-180-30" # cltv-180-180 | cltv-180-90 | cltv-180-30 pipeline_parameters_substitutions: null + # This pipeline contains the configuration parameters for the propensity training and inference pipelines for the lead score propensity model. + # To deploy this pipeline to your Google Cloud project: + ## 1. Define the pipeline parameters below, following YAML format + ## 2. 
Define the bigquery or Vertex AI KFP components to be used by your pipeline in `python/pipelines` section, if applicable. + ## 3. Define or reuse the pipeline definition method to be used to compile the pipeline into a YAML file in `python/pipelines` section. + ## 4. Create the terraform resources to compile, upload and schedule the pipeline in `terraform/pipelines/pipelines.tf` + ## 5. The python functions that perform `compilation` and `upload to GCS bucket` are defined in `python/pipelines/compiler.py` and `python/pipelines/uploader.py`. + ## 6. The python function that performs the `schedule` of the pipeline is defined in `python/pipelines/scheduler.py`. + ## 7. Run terraform apply + ## Note: For `type` = "tabular-workflows", the pre-compiled YAML file `automl_tabular_pl_v4.yaml` is recompiled by parsing the `pipeline_parameters` below as default values + ## to the new YAML file. The recompiled YAML will be uploaded and scheduled in Vertex AI Pipelines. + lead_score_propensity: + training: + name: "lead-score-propensity-training-pl" + job_id_prefix: "lead-score-propensity-training-pl-" + experiment_name: "lead-score-propensity-training" + # `type` can be "custom" or "tabular-workflows". + # For Vertex AI Tabular Workflows use the latter; for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). + type: "tabular-workflows" + schedule: + cron: "TZ=${time_zone} 0 8 * * SAT" + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". + # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.lead_score_propensity.training.schedule.state} + # These are pipeline parameters that will be passed to the pipeline to be recompiled + pipeline_parameters: + project: "${project_id}" + location: "${cloud_region}" + root_dir: "gs://${project_id}-pipelines/lead-score-propensity-training" + transformations: "gs://${project_id}-pipelines/lead-score-propensity-training/transformations_config_{timestamp}.json" + # These are specific data types transformations that will be applied to the dataset. + custom_transformations: "pipelines/transformations-lead-score-propensity.json" + train_budget_milli_node_hours: 100 # 1000 = 1 hour + # Set these to apply feature selection tuning. + max_selected_features: 20 + apply_feature_selection_tuning: true + run_evaluation: true + run_distillation: false + # The Lead Score Propensity model name + model_display_name: "lead-score-propensity-model" + # The Lead Score Propensity model description + model_description: "Lead Score Propensity Classification AutoML Model" + # Set `prediction_type` to "regression" for training models that predict a numerical value. For classification models, use "classification" and you will + # also get the probability likelihood for that class. + prediction_type: "classification" + # The optimization objectives may change depending on the `prediction_type`. + # For binary classification, use "maximize-au-roc", "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall" or "maximize-recall-at-precision". + # For multi class classification, use "minimize-log-loss". + # For regression, use "minimize-rmse", "minimize-mae", or "minimize-rmsle".
+ optimization_objective: "maximize-au-roc" # maximize-precision-at-recall, maximize-au-prc, maximize-au-roc, minimize-log-loss, maximize-recall-at-precision + #Don't use when parameter `optimization_objective` is not `maximize-precision-at-recall` or `maximize-recall-at-precision` + #optimization_objective_recall_value: 0.72 + #optimization_objective_precision_value: 0.72 + target_column: "will_login" + predefined_split_key: "data_split" + data_source_csv_filenames: null + training_fraction: null + validation_fraction: null + test_fraction: null + # This is the training dataset provided during the training routine. + # The schema in this table or view must match the schema in the json files. + # Take into consideration the `excluded_features` list below. They won't be used for training. + data_source_bigquery_table_path: "bq://${project_id}.lead_score_propensity.v_lead_score_propensity_training_5_1_last_window" + data_source_bigquery_table_schema: "../sql/schema/table/lead_score_propensity_training_preparation.json" + dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" + transform_dataflow_max_num_workers: 10 + stats_and_example_gen_dataflow_max_num_workers: 10 + evaluation_dataflow_starting_num_workers: 5 + evaluation_dataflow_max_num_workers: 10 + distill_batch_predict_machine_type: "n1-highmem-8" + distill_batch_predict_max_replica_count: 5 + distill_batch_predict_starting_replica_count: 5 + evaluation_batch_predict_max_replica_count: 5 + evaluation_batch_predict_starting_replica_count: 5 + evaluation_batch_explain_max_replica_count: 5 + evaluation_batch_explain_starting_replica_count: 5 + stage_1_num_parallel_trials: 5 + stage_2_num_parallel_trials: 5 + evaluation_dataflow_disk_size_gb: 30 + stats_and_example_gen_dataflow_disk_size_gb: 30 + transform_dataflow_disk_size_gb: 30 + timestamp_split_key: null + stratified_split_key: null + weight_column: null + additional_experiments: null + export_additional_model_without_custom_ops: false + # Override the study spec parameters in case you want to restrict hyperparameter search space. Including `model_type`. + # In this case, for Value Based Bidding, we're looking for a perfect fit using a tree based model. + # Don't use when parameter `apply_feature_selection_tuning` is `true` + #study_spec_parameters_override: + # - parameter_id: "model_type" + # categorical_value_spec: + # values: + # - nn + # - boosted_trees + # - parameter_id: "feature_selection_rate" + # double_value_spec: + # min_value: 0.5 + # max_value: 1.0 + # scale_type: UNIT_LINEAR_SCALE + # Features to be excluded from the training dataset. + exclude_features: + - processed_timestamp + - data_split + #- feature_date + - user_pseudo_id + - user_id + - device_web_browser_version + - device_os_version + - will_login + pipeline_parameters_substitutions: null + prediction: + name: "lead-score-propensity-prediction-pl" + job_id_prefix: "lead-score-propensity-prediction-pl-" + experiment_name: "lead-score-propensity-prediction" + # `type` can be "custom" or "tabular-workflows". + # For using Vertex AI Tabular Workflow use the later, for all other modeling approaches use "custom" (i.e. BQML, Scikit-learn). + type: "custom" + schedule: + cron: "TZ=${time_zone} 0 5 * * *" + max_concurrent_run_count: 1 + start_time: null + end_time: null + # The `subnetwork` defines the subnetwork in which the pipeline will be executed. + # The default value is "default". 
+ # Follow the guide: https://cloud.google.com/vertex-ai/docs/general/vpc-peering + subnetwork: "default" + # If you want to use the vpc network defined above, set the following flag to true + use_private_service_access: false + state: ${pipeline_configuration.lead_score_propensity.prediction.schedule.state} + pipeline_parameters: + project_id: "${project_id}" + location: "${cloud_region}" + job_name_prefix: "lead-score-propensity-prediction-pl-" + # The Lead Score Propensity model name to be used for prediction + model_display_name: "lead-score-propensity-model" + model_metric_name: "logLoss" + # The `model_metric_threshold` parameter defines the maximum acceptable value of the `model_metric_name` for a model to be selected. + # If the actual model metric values are higher than this limit, no models will be selected and the pipeline will fail. + model_metric_threshold: 0.9 + number_of_models_considered: 1 + # This is the prediction dataset table or view. + bigquery_source: "${project_id}.lead_score_propensity.v_lead_score_propensity_inference_5_1" + bigquery_destination_prefix: "${project_id}.lead_score_propensity" + bq_unique_key: "user_pseudo_id" + machine_type: "n1-standard-4" + max_replica_count: 10 + batch_size: 64 + accelerator_count: 0 + accelerator_type: "ACCELERATOR_TYPE_UNSPECIFIED" # ONE OF ACCELERATOR_TYPE_UNSPECIFIED, NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4 + generate_explanation: false + # This is the probability value used to split predictions into the two classes. + # For probabilities higher than `threashold`, the positive label is set to 1, otherwise 0. + threashold: 0.5 + positive_label: "1" + # These are parameters to trigger the Activation Application Dataflow. + pubsub_activation_topic: "activation-trigger" + pubsub_activation_type: "lead-score-propensity-5-1" # lead-score-propensity-5-1 + pipeline_parameters_substitutions: null + # This pipeline contains the configuration parameters for the feature creation pipeline for the audience segmentation model. # This block defines the pipeline parameters that are going to be used for three tasks: compilation, upload and scheduling. # To deploy this pipeline to your Google Cloud project: @@ -1533,7 +1788,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ACTIVE # possible states ACTIVE or PAUSED # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline. @@ -1597,7 +1852,7 @@ vertex_ai: use_private_service_access: false # The `state` defines the state of the pipeline. # In case you don't want to schedule the pipeline, set the state to `PAUSED`. - state: PAUSED # possible states ACTIVE or PAUSED + state: ACTIVE # possible states ACTIVE or PAUSED # The `pipeline_parameters` defines the parameters that are going to be used to compile the pipeline. # Those values may difer depending on the pipeline type and the pipeline steps being used. # Make sure you review the python function the defines the pipeline.
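(Editor's note on the recurring `state:` edits in this file: these placeholders are resolved when Terraform renders `config/config.yaml.tftpl` with `templatefile()`, and for values like ACTIVE or PAUSED removing the single quotes does not change the parsed value, since YAML reads `state: PAUSED` and `state: 'PAUSED'` as the same string. A minimal sketch of that rendering is shown below, assuming a map-shaped `pipeline_configuration` input; the variable plumbing here is illustrative, not taken from this repository.)

```hcl
# Illustrative sketch only. It assumes a map-shaped `pipeline_configuration`
# variable; the real module passes more template variables than shown here.
variable "pipeline_configuration" {
  type = any
  default = {
    purchase_propensity = {
      training = { schedule = { state = "PAUSED" } } # ACTIVE or PAUSED
    }
  }
}

locals {
  # templatefile() substitutes every ${...} placeholder in config.yaml.tftpl,
  # including nested lookups such as
  # ${pipeline_configuration.purchase_propensity.training.schedule.state}.
  rendered_config = templatefile("${path.module}/config/config.yaml.tftpl", {
    pipeline_configuration = var.pipeline_configuration
    # ...plus project_id, location, time_zone, mds_* and the other variables
    # referenced by the template.
  })

  # yamldecode() fails at plan time if the rendered document is not valid YAML,
  # which is a cheap sanity check after quoting changes like the ones above.
  parsed_config = yamldecode(local.rendered_config)
}
```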
@@ -1726,6 +1981,18 @@ bigquery: description: "Dataset with gemini_insights results from multiple use cases" friendly_name: "Gemini Insights Dataset" max_time_travel_hours: 168 + # Dataset for the lead score propensity use case. + lead_score_propensity: + name: "lead_score_propensity" + location: "${location}" + project_id: "${project_id}" + collation: "und:ci" + is_case_insensitive: TRUE + description: "Lead Score Propensity Use Case dataset for Marketing behavioural modeling" + friendly_name: "Lead Score Propensity Dataset" + max_time_travel_hours: 168 + default_partition_expiration_days: 365 + default_table_expiration_days: 365 table: # Table containing the feature engineered dataset that will be used for the Audience Segmentation prediction pipeline. audience_segmentation_inference_preparation: @@ -1762,6 +2029,13 @@ bigquery: table_name: "churn_propensity_inference_preparation" location: "${location}" table_description: "Purchase Propensity Inference Preparation table to be used for Model Prediction" + # Table containing the feature engineered dataset that will be used for the Lead Score Propensity prediction pipeline. + lead_score_propensity_inference_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + table_name: "lead_score_propensity_inference_preparation" + location: "${location}" + table_description: "Lead Score Propensity Inference Preparation table to be used for Model Prediction" + # Table containing the feature engineered labels that will be used for the Purchase Propensity training pipeline. purchase_propensity_label: project_id: "${project_id}" @@ -1769,13 +2043,20 @@ bigquery: table_name: "purchase_propensity_label" location: "${location}" table_description: "Purchase Propensity Label table to be used for Model Prediction" - # Table containing the feature engineered labels that will be used for the Purchase Propensity training pipeline. + # Table containing the feature engineered labels that will be used for the Churn Propensity training pipeline. churn_propensity_label: project_id: "${project_id}" dataset: "feature_store" table_name: "churn_propensity_label" location: "${location}" table_description: "Churn Propensity Label table to be used for Model Prediction" + # Table containing the feature engineered labels that will be used for the Lead Score Propensity training pipeline. + lead_score_propensity_label: + project_id: "${project_id}" + dataset: "feature_store" + table_name: "lead_score_propensity_label" + location: "${location}" + table_description: "Lead Score Propensity Label table to be used for Model Prediction" + # Table containing the feature engineered dimensions that will be used for the Purchase Propensity training and inference pipeline. user_dimensions: project_id: "${project_id}" @@ -1811,6 +2092,13 @@ bigquery: table_name: "user_rolling_window_metrics" location: "${location}" table_description: "User Rolling Window Metrics table as part of the Feature Store for Purchase Propensity use case" + # Table containing the feature engineered rolling window metrics that will be used for the Lead Score Propensity training and inference pipeline.
+ user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + table_name: "user_rolling_window_lead_metrics" + location: "${location}" + table_description: "User Rolling Window Lead Metrics table as part of the Feature Store for Lead Score Propensity use case" # Table containing the feature engineered all users metrics that will be used for the Customer Lifetime Value training and inference pipeline. user_scoped_lifetime_metrics: project_id: "${project_id}" @@ -1860,6 +2148,9 @@ bigquery: churn_propensity_query_template: none: none # This is a query template to be used by the Activation application, so there is no configuration to be applied. + lead_score_propensity_query_template: + none: none + # This is a query template to be used by the Activation application, so there is no configuration to be applied. cltv_query_template: none: none create_gemini_model: @@ -1875,6 +2166,25 @@ bigquery: project_id: "${project_id}" dataset: "aggregated_vbb" stored_procedure: "aggregated_value_based_bidding_training_preparation" + # This is a stored procedure that CALLs the Lead Score Propensity Training Preparation stored procedure. + invoke_lead_score_propensity_training_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + stored_procedure: "lead_score_propensity_training_preparation" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window. + interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # `train_split_end_number` must be smaller than `validation_split_end_number`. + # This is a number out of 10 deciles, how many rows will belong to the `data_split` = TRAIN (Between 1 and `train_split_end_number`) + train_split_end_number: 5 + # This is a number out of 10 deciles, how many rows will belong to the `data_split` = VALIDATE (Between `train_split_end_number` and `validation_split_end_number`) + # The rest of the rows will belong to the `data_split` = TEST (Between `validation_split_end_number` and 10) + validation_split_end_number: 8 + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" # This is a stored procedure that CALLs the Purchase Propensity Training Preparation stored procedure. invoke_purchase_propensity_training_preparation: project_id: "${project_id}" @@ -1894,7 +2204,7 @@ bigquery: validation_split_end_number: 8 mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" - # This is a stored procedure that CALLs the Purchase Propensity Training Preparation stored procedure. + # This is a stored procedure that CALLs the Churn Propensity Training Preparation stored procedure. invoke_churn_propensity_training_preparation: project_id: "${project_id}" dataset: "churn_propensity" @@ -1962,6 +2272,11 @@ bigquery: stored_procedure: "auto_audience_segmentation_training_preparation" # The `lookback_days` parameter is the number of days to look back for training data. lookback_days: 15 + # This is a stored procedure that CALLs the Lead Score Propensity Inference Preparation stored procedure.
+ invoke_lead_score_propensity_inference_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + stored_procedure: "lead_score_propensity_inference_preparation" # This is a stored procedure that CALLs the Purchase Propensity Inference Inference Preparation stored procedure. invoke_purchase_propensity_inference_preparation: project_id: "${project_id}" @@ -2085,6 +2400,22 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 180 + # This is a stored procedure that CALLs the Lead Score Propensity Label Backfill stored procedure. + invoke_backfill_lead_score_propensity_label: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + insert_table: "lead_score_propensity_label" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window. + interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 5 # This is a stored procedure that CALLs the Purchase Propensity Label Backfill stored procedure. invoke_backfill_purchase_propensity_label: mds_project_id: "${mds_project_id}" @@ -2154,6 +2485,22 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 30 + # This is a stored procedure that CALLs the User Rolling Window Lead Metrics Backfill stored procedure. + invoke_backfill_user_rolling_window_lead_metrics: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + insert_table: "user_rolling_window_lead_metrics" + # The `interval_max_date` parameter defines how many days we leave out of the training dataset after the latest date in the dataset. + # This is usually the same value as the look forward window. + interval_max_date: 1 + # The `interval_min_date` parameter defines how many days we leave out of the training dataset before the first date in the dataset. + # This is usually the same value as the lookback window. + interval_min_date: 5 + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 5 # This is a stored procedure that CALLs the User Scoped Metrics Backfill stored procedure. invoke_backfill_user_scoped_metrics: mds_project_id: "${mds_project_id}" @@ -2219,6 +2566,16 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_input_date: 180 + # This is a stored procedure that CALLs the Lead Score Propensity Label stored procedure.
+ invoke_lead_score_propensity_label: + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + project_id: "${project_id}" + dataset: "feature_store" + stored_procedure: "lead_score_propensity_label" + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_input_date: 1 # This is a stored procedure that CALLs the Purchase Propensity Label stored procedure. invoke_purchase_propensity_label: mds_project_id: "${mds_project_id}" @@ -2311,6 +2668,14 @@ bigquery: # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. # This is usually the same value as the look forward window. interval_end_date: 180 + # This is a stored procedure that CALLs the User Rolling Window Lead Metrics stored procedure. + invoke_user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + stored_procedure: "user_rolling_window_lead_metrics" + # The `interval_end_date` parameter defines how many days we leave out of the backfill before the last dates of events. + # This is usually the same value as the look forward window. + interval_end_date: 15 # This is a stored procedure that CALLs the User Scoped Metrics stored procedure. invoke_user_scoped_metrics: project_id: "${project_id}" @@ -2322,10 +2687,11 @@ bigquery: # This section sets the parameters for the features, training and inference procedures that insert data into tables and views to be used for # training and prediction. # There is no strict recommendation on the right parameters that will maximize the models performance, however here are some back of the envelope numbers. - # Purchase Propensity model: 1 month-2 years for dates interval. From Xk - 10M users. - # Customer LTV model: 6 months-2 years for dates interval. From Xk - 10M users. - # Audience Segmentation / Auto Audience Segmentation models: 1 month-1 year for dates interval. From XXX - 10M users. - # Aggregated VBB model: 1000 days - 2000 days + # Lead Score Propensity model: 2 weeks-1 year for dates interval. From Xk - 1M users. + # Purchase Propensity model: 1 month-2 years for dates interval. From Xk - 1M users. + # Customer LTV model: 6 months-2 years for dates interval. From Xk - 1M users. + # Audience Segmentation / Auto Audience Segmentation models: 1 month-1 year for dates interval. From XXX - 1M users. + # Aggregated VBB model: 100 days - 2000 days # Note: For Aggregated VBB, it's common to duplicate rows to that training dataset size reaches at least 1k rows for AutoML to train a model. # If that is your case, this is not a problem since typically duplicated rows has a similar effect as of training the model for more epochs. procedure: @@ -2413,6 +2779,17 @@ bigquery: expiration_duration_hours: 168 custom_start_date: "'2023-01-01'" custom_end_date: "NULL" + # This is the stored procedure that calculates the label column for the Lead Score Propensity use case. + # The label represents whether a user will generate a lead score event over a period of time. + # Typically, looking at a period of 1 day in the future. + # The granularity level is per user per day.
+ lead_score_propensity_label: + project_id: "${project_id}" + dataset: "feature_store" + name: "lead_score_propensity_label" + insert_table: "lead_score_propensity_label" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" # This is the stored procedure that calculates the label column for the Purchase Propensity use case. # The label represents wether a user will make a purchase over a period of time. # Typically, looking at a period of 15 to 30 days in the future. @@ -2425,7 +2802,7 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" # This is the stored procedure that calculates the label column for the Churn Propensity use case. - # The label represents wether a user will make a purchase over a period of time. + # The label represents wether a user will churn over a period of time. # Typically, looking at a period of 30 days in the future. # The granularity level is per user per day. churn_propensity_label: @@ -2444,6 +2821,22 @@ bigquery: # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. # There is a minimum number of examples rows of 1000 and the maximum is as much as it fits in memory, overall consensus is that for ML models # you will provide at maximum a couple of millions of rows. + lead_score_propensity_training_preparation: + project_id: "${project_id}" + dataset: "lead_score_propensity" + name: "lead_score_propensity_training_preparation" + insert_table: "lead_score_propensity_training_full_dataset" + feature_store_project_id: "${project_id}" + feature_store_dataset: "feature_store" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" + expiration_duration_hours: 168 + custom_start_date: "'2024-01-01'" + custom_end_date: "NULL" + # This is the stored procedure that collects the features and prepare the examples rows to train a model. + # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. + # There is a minimum number of examples rows of 1000 and the maximum is as much as it fits in memory, overall consensus is that for ML models + # you will provide at maximum a couple of millions of rows. purchase_propensity_training_preparation: project_id: "${project_id}" dataset: "purchase_propensity" @@ -2454,7 +2847,7 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" expiration_duration_hours: 168 - custom_start_date: "'2023-01-01'" + custom_start_date: "'2024-01-01'" custom_end_date: "NULL" # This is the stored procedure that collects the features and prepare the examples rows to train a model. # The procedure will split the data into three splits (TRAIN, VALIDATE, TEST) and will take care of avoiding splits contamination. @@ -2524,6 +2917,16 @@ bigquery: insert_table: "user_rolling_window_metrics" mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" + # This is the stored procedure that UPSERTs new look back rolling windows metrics rows daily. + # The granularity level is per user per day. + # These metrics are used for the Lead Score Propensity use case. + user_rolling_window_lead_metrics: + project_id: "${project_id}" + dataset: "feature_store" + name: "user_rolling_window_lead_metrics" + insert_table: "user_rolling_window_lead_metrics" + mds_project_id: "${mds_project_id}" + mds_dataset: "${mds_dataset}" # This is the stored procedure that UPSERTs new aggregated users metrics rows daily. 
# The granularity level is per day, whereas the calculations take into consideration all users. # These metrics are used for the Customer Lifetime Value use case. @@ -2575,6 +2978,16 @@ bigquery: mds_project_id: "${mds_project_id}" mds_dataset: "${mds_dataset}" # This is the stored procedure that collects the features and prepare the examples rows for daily prediction. + lead_score_propensity_inference_preparation: + project_id: "${project_id}" + mds_dataset: "${mds_dataset}" + dataset: "lead_score_propensity" + name: "lead_score_propensity_inference_preparation" + feature_store_project_id: "${project_id}" + feature_store_dataset: "feature_store" + insert_table: "lead_score_propensity_inference_preparation" + expiration_duration_hours: 168 + # This is the stored procedure that collects the features and prepare the examples rows for daily prediction. purchase_propensity_inference_preparation: project_id: "${project_id}" mds_dataset: "${mds_dataset}" diff --git a/infrastructure/terraform/.terraform.lock.hcl b/infrastructure/terraform/.terraform.lock.hcl index 04cda0d6..fb574eb5 100644 --- a/infrastructure/terraform/.terraform.lock.hcl +++ b/infrastructure/terraform/.terraform.lock.hcl @@ -167,6 +167,7 @@ provider "registry.terraform.io/hashicorp/template" { provider "registry.terraform.io/hashicorp/time" { version = "0.12.1" hashes = [ + "h1:6BhxSYBJdBBKyuqatOGkuPKVenfx6UmLdiI13Pb3his=", "h1:j+ED7j0ZFJ4EDx7sdna76wsiIf397toylDN0dFi6v0U=", "zh:090023137df8effe8804e81c65f636dadf8f9d35b79c3afff282d39367ba44b2", "zh:26f1e458358ba55f6558613f1427dcfa6ae2be5119b722d0b3adb27cd001efea", diff --git a/infrastructure/terraform/README.md b/infrastructure/terraform/README.md index dc2836b0..fcbb9246 100644 --- a/infrastructure/terraform/README.md +++ b/infrastructure/terraform/README.md @@ -106,6 +106,7 @@ Also, this method allows you to extend this solution and develop it to satisfy y Terraform stores state about managed infrastructure to map real-world resources to the configuration, keep track of metadata, and improve performance. Terraform stores this state in a local file by default, but you can also use a Terraform remote backend to store state remotely. [Remote state](https://developer.hashicorp.com/terraform/cdktf/concepts/remote-backends) makes it easier for teams to work together because all members have access to the latest state data in the remote store. 
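(Editor's note: for readers new to remote backends, the README hunk below relies on `scripts/generate-tf-backend.sh` to write the actual backend configuration; the following is only a minimal sketch of the kind of GCS backend block such a script typically produces, with placeholder bucket and prefix names.)

```hcl
# Illustrative only: scripts/generate-tf-backend.sh writes the real backend
# definition for this repository; the bucket and prefix below are placeholders.
terraform {
  backend "gcs" {
    bucket = "my-tf-state-bucket"  # assumed: a pre-created GCS bucket for state
    prefix = "marketing-analytics" # assumed: object prefix for this deployment's state
  }
}
```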
```bash + SOURCE_ROOT="${HOME}/${REPO}" cd ${SOURCE_ROOT} scripts/generate-tf-backend.sh ``` diff --git a/infrastructure/terraform/modules/activation/export-procedures.tf b/infrastructure/terraform/modules/activation/export-procedures.tf index e7281356..86d942be 100644 --- a/infrastructure/terraform/modules/activation/export-procedures.tf +++ b/infrastructure/terraform/modules/activation/export-procedures.tf @@ -119,10 +119,33 @@ resource "google_bigquery_routine" "export_churn_propensity_procedure" { routine_type = "PROCEDURE" language = "SQL" definition_body = data.template_file.churn_propensity_csv_export_query.rendered - description = "Export purchase propensity predictions as CSV for GA4 User Data Import" + description = "Export churn propensity predictions as CSV for GA4 User Data Import" arguments { name = "prediction_table_name" mode = "IN" data_type = jsonencode({ "typeKind" : "STRING" }) } } + +data "template_file" "lead_score_propensity_csv_export_query" { + template = file("${local.source_root_dir}/templates/activation_user_import/lead_score_propensity_csv_export.sqlx") + vars = { + ga4_stream_id = var.ga4_stream_id + export_bucket = module.pipeline_bucket.name + } +} + +resource "google_bigquery_routine" "export_lead_score_propensity_procedure" { + project = null_resource.check_bigquery_api.id != "" ? module.project_services.project_id : var.project_id + dataset_id = module.bigquery.bigquery_dataset.dataset_id + routine_id = "export_lead_score_propensity_predictions" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.template_file.lead_score_propensity_csv_export_query.rendered + description = "Export lead score propensity predictions as CSV for GA4 User Data Import" + arguments { + name = "prediction_table_name" + mode = "IN" + data_type = jsonencode({ "typeKind" : "STRING" }) + } +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/activation/main.tf b/infrastructure/terraform/modules/activation/main.tf index 1bf35a9d..ef58ce59 100644 --- a/infrastructure/terraform/modules/activation/main.tf +++ b/infrastructure/terraform/modules/activation/main.tf @@ -24,6 +24,8 @@ locals { cltv_query_template_file = "cltv_query_template.sqlx" purchase_propensity_query_template_file = "purchase_propensity_query_template.sqlx" purchase_propensity_vbb_query_template_file = "purchase_propensity_vbb_query_template.sqlx" + lead_score_propensity_query_template_file = "lead_score_propensity_query_template.sqlx" + lead_score_propensity_vbb_query_template_file = "lead_score_propensity_vbb_query_template.sqlx" churn_propensity_query_template_file = "churn_propensity_query_template.sqlx" activation_container_image_id = "activation-pipeline" docker_repo_prefix = "${var.location}-docker.pkg.dev/${var.project_id}" @@ -750,7 +752,7 @@ data "template_file" "churn_propensity_query_template_file" { } } -# This resource creates a bucket object using as content the purchase_propensity_query_template_file file. +# This resource creates a bucket object using as content the churn_propensity_query_template_file file. 
resource "google_storage_bucket_object" "churn_propensity_query_template_file" { name = "${local.configuration_folder}/${local.churn_propensity_query_template_file}" content = data.template_file.churn_propensity_query_template_file.rendered @@ -791,6 +793,40 @@ resource "google_storage_bucket_object" "purchase_propensity_vbb_query_template_ bucket = module.pipeline_bucket.name } +data "template_file" "lead_score_propensity_query_template_file" { + template = file("${local.template_dir}/activation_query/${local.lead_score_propensity_query_template_file}") + + vars = { + mds_project_id = var.mds_project_id + mds_dataset_suffix = var.mds_dataset_suffix + } +} + +# This resource creates a bucket object using as content the lead_score_propensity_query_template_file file. +resource "google_storage_bucket_object" "lead_score_propensity_query_template_file" { + name = "${local.configuration_folder}/${local.lead_score_propensity_query_template_file}" + content = data.template_file.lead_score_propensity_query_template_file.rendered + bucket = module.pipeline_bucket.name +} + +# This resource creates a bucket object using as content the lead_score_propensity_vbb_query_template_file file. +data "template_file" "lead_score_propensity_vbb_query_template_file" { + template = file("${local.template_dir}/activation_query/${local.lead_score_propensity_vbb_query_template_file}") + + vars = { + mds_project_id = var.mds_project_id + mds_dataset_suffix = var.mds_dataset_suffix + activation_project_id = var.project_id + dataset = module.bigquery.bigquery_dataset.dataset_id + } +} + +resource "google_storage_bucket_object" "lead_score_propensity_vbb_query_template_file" { + name = "${local.configuration_folder}/${local.lead_score_propensity_vbb_query_template_file}" + content = data.template_file.lead_score_propensity_vbb_query_template_file.rendered + bucket = module.pipeline_bucket.name +} + # This data resources creates a data resource that renders a template file and stores the rendered content in a variable. 
data "template_file" "activation_type_configuration" { template = file("${local.template_dir}/activation_type_configuration_template.tpl") @@ -802,6 +838,8 @@ data "template_file" "activation_type_configuration" { purchase_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.purchase_propensity_query_template_file.output_name}" purchase_propensity_vbb_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.purchase_propensity_vbb_query_template_file.output_name}" churn_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.churn_propensity_query_template_file.output_name}" + lead_score_propensity_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.lead_score_propensity_query_template_file.output_name}" + lead_score_propensity_vbb_query_template_gcs_path = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.lead_score_propensity_vbb_query_template_file.output_name}" } } diff --git a/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf b/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf index d7b08539..52816e44 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-datasets.tf @@ -90,6 +90,32 @@ resource "google_bigquery_dataset" "churn_propensity" { } } +# This resource creates a BigQuery dataset called `lead_score_propensity`. +resource "google_bigquery_dataset" "lead_score_propensity" { + dataset_id = local.config_bigquery.dataset.lead_score_propensity.name + friendly_name = local.config_bigquery.dataset.lead_score_propensity.friendly_name + project = null_resource.check_bigquery_api.id != "" ? local.lead_score_propensity_project_id : local.feature_store_project_id + description = local.config_bigquery.dataset.lead_score_propensity.description + location = local.config_bigquery.dataset.lead_score_propensity.location + # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. + # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.feature_store.max_time_travel_hours configuration. + max_time_travel_hours = local.config_bigquery.dataset.lead_score_propensity.max_time_travel_hours + # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. + # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. + delete_contents_on_destroy = false + + labels = { + version = "prod" + } + + # The lifecycle block allows you to configure the lifecycle of the dataset. + # In this case, the ignore_changes attribute is set to all, which means that + # Terraform will ignore any changes to the dataset and will not attempt to update the dataset. + lifecycle { + ignore_changes = all + } +} + # This resource creates a BigQuery dataset called `customer_lifetime_value`. 
resource "google_bigquery_dataset" "customer_lifetime_value" { dataset_id = local.config_bigquery.dataset.customer_lifetime_value.name @@ -300,7 +326,8 @@ module "gemini_insights" { location = local.config_bigquery.dataset.gemini_insights.location # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. - delete_contents_on_destroy = true + delete_contents_on_destroy = false + deletion_protection = true dataset_labels = { version = "prod", @@ -314,7 +341,7 @@ module "gemini_insights" { # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.gemini_insights.max_time_travel_hours configuration. max_time_travel_hours = local.config_bigquery.dataset.gemini_insights.max_time_travel_hours - deletion_protection = false + deletion_protection = true time_partitioning = null, range_partitioning = null, expiration_time = null, diff --git a/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf b/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf index 1424656c..628a6848 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-procedures.tf @@ -350,6 +350,32 @@ resource "google_bigquery_routine" "churn_propensity_inference_preparation" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_inference_preparation.sql and +# stores it in a variable named lead_score_propensity_inference_preparation_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_inference_preparation. +data "local_file" "lead_score_propensity_inference_preparation_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_inference_preparation.sql" +} + +# The lead_score_propensity_inference_preparation procedure is designed to prepare features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before prediction the Lead Score Propensity model to ensure that the features data +# is in the correct format and contains the necessary features for prediction. +resource "google_bigquery_routine" "lead_score_propensity_inference_preparation" { + project = null_resource.check_bigquery_api.id != "" ? local.lead_score_propensity_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "lead_score_propensity_inference_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_inference_preparation_file.content + description = "Procedure that prepares features for Lead Score Propensity model inference. User-per-day granularity level features. Run this procedure every time before Lead Score Propensity model predict." + arguments { + name = "inference_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } +} + # This resource reads the contents of a local SQL file named purchase_propensity_label.sql and # stores it in a variable named purchase_propensity_label_file.content. 
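The `lead_score_propensity_inference_preparation` routine above takes an `INOUT DATE` argument, so an ad-hoc invocation has to go through a BigQuery script that declares the variable first. A hedged sketch follows; the project and dataset IDs are placeholders.

```python
# Sketch: call the inference preparation procedure with its INOUT DATE argument.
# Assumptions: project and dataset IDs are placeholders.
from google.cloud import bigquery

project = "my-feature-store-project"
client = bigquery.Client(project=project)

script = f"""
DECLARE inference_date DATE DEFAULT CURRENT_DATE();
CALL `{project}.lead_score_propensity.lead_score_propensity_inference_preparation`(inference_date);
"""
client.query(script).result()  # blocks until the procedure completes
```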
# The SQL file is expected to contain the definition of a BigQuery procedure named purchase_propensity_label. @@ -422,6 +448,42 @@ resource "google_bigquery_routine" "churn_propensity_label" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_label.sql and +# stores it in a variable named lead_score_propensity_label_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_label. +data "local_file" "lead_score_propensity_label_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_label.sql" +} + +# The lead_score_propensity_label procedure is designed to prepare label for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the labeled data +# is in the correct format and ready for training. +resource "google_bigquery_routine" "lead_score_propensity_label" { + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_label_file.content + description = "User-per-day granularity level labels. Run this procedure daily." + arguments { + name = "input_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "rows_added" + mode = "OUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} + # This resource reads the contents of a local SQL file named purchase_propensity_training_preparation.sql and # stores it in a variable named purchase_propensity_training_preparation_file.content. # The SQL file is expected to contain the definition of a BigQuery procedure named purchase_propensity_training_preparation. @@ -463,6 +525,46 @@ resource "google_bigquery_routine" "purchase_propensity_training_preparation" { } } +# This resource reads the contents of a local SQL file named lead_score_propensity_training_preparation.sql and +# stores it in a variable named lead_score_propensity_training_preparation_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named lead_score_propensity_training_preparation. +data "local_file" "lead_score_propensity_training_preparation_file" { + filename = "${local.sql_dir}/procedure/lead_score_propensity_training_preparation.sql" +} + +# The lead_score_propensity_training_preparation procedure is designed to prepare features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the features data +# is in the correct format and contains the necessary features for training. +resource "google_bigquery_routine" "lead_score_propensity_training_preparation" { + project = null_resource.check_bigquery_api.id != "" ? 
local.lead_score_propensity_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "lead_score_propensity_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.lead_score_propensity_training_preparation_file.content + description = "Procedure that prepares features for Lead Score Propensity model training. User-per-day granularity level features. Run this procedure every time before Lead Score Propensity model train." + arguments { + name = "start_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "train_split_end_number" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } + arguments { + name = "validation_split_end_number" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} # This resource reads the contents of a local SQL file named churn_propensity_training_preparation.sql and # stores it in a variable named churn_propensity_training_preparation_file.content. @@ -685,6 +787,42 @@ resource "google_bigquery_routine" "user_rolling_window_metrics" { } } +# This resource reads the contents of a local SQL file named user_rolling_window_lead_metrics.sql and +# stores it in a variable named user_rolling_window_lead_metrics_file.content. +# The SQL file is expected to contain the definition of a BigQuery procedure named user_rolling_window_lead_metrics. +data "local_file" "user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/procedure/user_rolling_window_lead_metrics.sql" +} + +# The user_rolling_window_lead_metrics procedure is designed to prepare the features for the Lead Score Propensity model. +# ## +# The procedure is typically invoked before training the Lead Score Propensity model to ensure that the features data +# is in the correct format and ready for training. +resource "google_bigquery_routine" "user_rolling_window_lead_metrics" { + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.user_rolling_window_lead_metrics_file.content + description = "User-per-day granularity level metrics. Run this procedure daily. Metrics calculated using a rolling window operation." + arguments { + name = "input_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "end_date" + mode = "INOUT" + data_type = jsonencode({ "typeKind" : "DATE" }) + } + arguments { + name = "rows_added" + mode = "OUT" + data_type = jsonencode({ "typeKind" : "INT64" }) + } +} + # This resource reads the contents of a local SQL file named user_scoped_lifetime_metrics.sql data "local_file" "user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/procedure/user_scoped_lifetime_metrics.sql" @@ -880,6 +1018,14 @@ resource "google_bigquery_routine" "user_behaviour_revenue_insights" { depends_on = [ null_resource.check_gemini_model_exists ] + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table.
The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } /* @@ -930,6 +1076,20 @@ resource "google_bigquery_routine" "invoke_backfill_churn_propensity_label" { description = "Procedure that backfills the churn_propensity_label feature table. Run this procedure occasionally before training the models." } +data "local_file" "invoke_backfill_lead_score_propensity_label_file" { + filename = "${local.sql_dir}/query/invoke_backfill_lead_score_propensity_label.sql" +} + +resource "google_bigquery_routine" "invoke_backfill_lead_score_propensity_label" { + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_backfill_lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_backfill_lead_score_propensity_label_file.content + description = "Procedure that backfills the lead_score_propensity_label feature table. Run this procedure occasionally before training the models." +} + data "local_file" "invoke_backfill_user_dimensions_file" { filename = "${local.sql_dir}/query/invoke_backfill_user_dimensions.sql" } @@ -1003,6 +1163,20 @@ resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_metrics" description = "Procedure that backfills the user_rolling_window_metrics feature table. Run this procedure occasionally before training the models." } +data "local_file" "invoke_backfill_user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/query/invoke_backfill_user_rolling_window_lead_metrics.sql" +} + +resource "google_bigquery_routine" "invoke_backfill_user_rolling_window_lead_metrics" { + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_backfill_user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_backfill_user_rolling_window_lead_metrics_file.content + description = "Procedure that backfills the user_rolling_window_lead_metrics feature table. Run this procedure occasionally before training the models." +} + data "local_file" "invoke_backfill_user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/query/invoke_backfill_user_scoped_lifetime_metrics.sql" @@ -1091,6 +1265,14 @@ resource "google_bigquery_routine" "invoke_backfill_user_behaviour_revenue_insig null_resource.check_gemini_model_exists, null_resource.create_gemini_model ] + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. 
+ lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } /* @@ -1139,6 +1321,19 @@ resource "google_bigquery_routine" "invoke_churn_propensity_inference_preparatio definition_body = data.local_file.invoke_churn_propensity_inference_preparation_file.content } +data "local_file" "invoke_lead_score_propensity_inference_preparation_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_inference_preparation.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_inference_preparation" { + project = null_resource.check_bigquery_api.id != "" ? local.lead_score_propensity_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "invoke_lead_score_propensity_inference_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_inference_preparation_file.content +} + data "local_file" "invoke_audience_segmentation_inference_preparation_file" { filename = "${local.sql_dir}/query/invoke_audience_segmentation_inference_preparation.sql" @@ -1222,6 +1417,19 @@ resource "google_bigquery_routine" "invoke_churn_propensity_training_preparation } +data "local_file" "invoke_lead_score_propensity_training_preparation_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_training_preparation.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_training_preparation" { + project = null_resource.check_bigquery_api.id != "" ? local.lead_score_propensity_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + routine_id = "invoke_lead_score_propensity_training_preparation" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_training_preparation_file.content +} + data "local_file" "invoke_audience_segmentation_training_preparation_file" { filename = "${local.sql_dir}/query/invoke_audience_segmentation_training_preparation.sql" } @@ -1298,6 +1506,20 @@ resource "google_bigquery_routine" "invoke_purchase_propensity_label" { } +data "local_file" "invoke_lead_score_propensity_label_file" { + filename = "${local.sql_dir}/query/invoke_lead_score_propensity_label.sql" +} + +resource "google_bigquery_routine" "invoke_lead_score_propensity_label" { + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_lead_score_propensity_label" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_lead_score_propensity_label_file.content + description = "Procedure that invokes the lead_score_propensity_label table. Daily granularity level. Run this procedure daily before running prediction pipelines." +} + data "local_file" "invoke_churn_propensity_label_file" { filename = "${local.sql_dir}/query/invoke_churn_propensity_label.sql" } @@ -1387,6 +1609,20 @@ resource "google_bigquery_routine" "invoke_user_rolling_window_metrics" { } +data "local_file" "invoke_user_rolling_window_lead_metrics_file" { + filename = "${local.sql_dir}/query/invoke_user_rolling_window_lead_metrics.sql" +} + +resource "google_bigquery_routine" "invoke_user_rolling_window_lead_metrics" { + project = null_resource.check_bigquery_api.id != "" ? 
local.feature_store_project_id : local.feature_store_project_id + dataset_id = google_bigquery_dataset.feature_store.dataset_id + routine_id = "invoke_user_rolling_window_lead_metrics" + routine_type = "PROCEDURE" + language = "SQL" + definition_body = data.local_file.invoke_user_rolling_window_lead_metrics_file.content + description = "Procedure that invokes the user_rolling_window_lead_metrics table. Daily granularity level. Run this procedure daily before running prediction pipelines." +} + data "local_file" "invoke_user_scoped_lifetime_metrics_file" { filename = "${local.sql_dir}/query/invoke_user_scoped_lifetime_metrics.sql" } @@ -1448,7 +1684,7 @@ data "local_file" "invoke_user_session_event_aggregated_metrics_file" { } resource "google_bigquery_routine" "invoke_user_session_event_aggregated_metrics" { - project = null_resource.check_bigquery_api.id != "" ? local.purchase_propensity_project_id : local.feature_store_project_id + project = null_resource.check_bigquery_api.id != "" ? local.feature_store_project_id : local.feature_store_project_id dataset_id = google_bigquery_dataset.feature_store.dataset_id routine_id = "invoke_user_session_event_aggregated_metrics" routine_type = "PROCEDURE" @@ -1481,7 +1717,8 @@ resource "null_resource" "create_gemini_model" { # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = true + #prevent_destroy = true + create_before_destroy = true } depends_on = [ diff --git a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf index 891a4b3e..e74fb1ed 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf @@ -112,6 +112,24 @@ resource "google_bigquery_table" "churn_propensity_inference_preparation" { schema = file("${local.sql_dir}/schema/table/churn_propensity_inference_preparation.json") } +# This resource creates a BigQuery table named lead_score_propensity_inference_preparation +# in the dataset specified by google_bigquery_dataset.lead_score_propensity.dataset_id. +resource "google_bigquery_table" "lead_score_propensity_inference_preparation" { + project = google_bigquery_dataset.lead_score_propensity.project + dataset_id = google_bigquery_dataset.lead_score_propensity.dataset_id + table_id = local.config_bigquery.table.lead_score_propensity_inference_preparation.table_name + description = local.config_bigquery.table.lead_score_propensity_inference_preparation.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/lead_score_propensity_inference_preparation.json") +} + # This resource creates a BigQuery table named purchase_propensity_label # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. 
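The parameterless `invoke_*` wrappers registered above (label, rolling-window lead metrics, training and inference preparation) can also be exercised manually with plain `CALL` statements, in roughly the order the feature engineering pipeline runs them. The dataset names in the sketch are assumptions taken from the Terraform resources (`feature_store` for the feature tables, `lead_score_propensity` for the preparation procedures), and the project ID is a placeholder.

```python
# Sketch: run the lead score invoke_* wrappers in pipeline order.
# Assumptions: project and dataset IDs are placeholders inferred from the
# Terraform resources above.
from google.cloud import bigquery

project = "my-feature-store-project"
client = bigquery.Client(project=project)

statements = [
    f"CALL `{project}.feature_store.invoke_lead_score_propensity_label`()",
    f"CALL `{project}.feature_store.invoke_user_rolling_window_lead_metrics`()",
    f"CALL `{project}.lead_score_propensity.invoke_lead_score_propensity_training_preparation`()",
    f"CALL `{project}.lead_score_propensity.invoke_lead_score_propensity_inference_preparation`()",
]
for statement in statements:
    print("running:", statement)
    client.query(statement).result()  # blocks until the procedure finishes
```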
resource "google_bigquery_table" "purchase_propensity_label" { @@ -162,6 +180,31 @@ resource "google_bigquery_table" "churn_propensity_label" { } } +# This resource creates a BigQuery table named lead_score_propensity_label +# in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. +resource "google_bigquery_table" "lead_score_propensity_label" { + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.lead_score_propensity_label.table_name + description = local.config_bigquery.table.lead_score_propensity_label.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/lead_score_propensity_label.json") + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + prevent_destroy = true + } +} + # This resource creates a BigQuery table named user_dimensions # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. resource "google_bigquery_table" "user_dimensions" { @@ -287,6 +330,31 @@ resource "google_bigquery_table" "user_rolling_window_metrics" { } } +# This resource creates a BigQuery table named user_rolling_window_lead_metrics +# in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. +resource "google_bigquery_table" "user_rolling_window_lead_metrics" { + project = google_bigquery_dataset.feature_store.project + dataset_id = google_bigquery_dataset.feature_store.dataset_id + table_id = local.config_bigquery.table.user_rolling_window_lead_metrics.table_name + description = local.config_bigquery.table.user_rolling_window_lead_metrics.table_description + + # The deletion_protection attribute specifies whether the table should be protected from deletion. In this case, it's set to false, which means that the table can be deleted. + deletion_protection = false + labels = { + version = "prod" + } + + # The schema attribute specifies the schema of the table. In this case, the schema is defined in the JSON file. + schema = file("${local.sql_dir}/schema/table/user_rolling_window_lead_metrics.json") + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + prevent_destroy = true + } +} + # This resource creates a BigQuery table named user_scoped_lifetime_metrics # in the dataset specified by google_bigquery_dataset.feature_store.dataset_id. 
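Unlike the wrappers, the underlying `lead_score_propensity_label` procedure declares `INOUT` dates plus an `OUT rows_added` counter, so a direct call uses the same scripting pattern as the inference preparation sketch earlier, with a trailing `SELECT` to surface the `OUT` value. The dates, project and dataset IDs below are placeholders.

```python
# Sketch: call lead_score_propensity_label directly and read back rows_added.
# Assumptions: project/dataset IDs and the date window are placeholders.
from google.cloud import bigquery

project = "my-feature-store-project"
client = bigquery.Client(project=project)

script = f"""
DECLARE input_date DATE DEFAULT DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY);
DECLARE end_date   DATE DEFAULT CURRENT_DATE();
DECLARE rows_added INT64;
CALL `{project}.feature_store.lead_score_propensity_label`(input_date, end_date, rows_added);
SELECT rows_added;
"""
rows = list(client.query(script).result())  # rows of the final SELECT
print("rows added:", rows[0]["rows_added"])
```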
resource "google_bigquery_table" "user_scoped_lifetime_metrics" { diff --git a/infrastructure/terraform/modules/feature-store/main.tf b/infrastructure/terraform/modules/feature-store/main.tf index 296d812f..9f1a8d7b 100644 --- a/infrastructure/terraform/modules/feature-store/main.tf +++ b/infrastructure/terraform/modules/feature-store/main.tf @@ -24,6 +24,7 @@ locals { builder_repository_id = "marketing-analytics-jumpstart-base-repo" purchase_propensity_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.purchase_propensity.project_id : local.feature_store_project_id churn_propensity_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.churn_propensity.project_id : local.feature_store_project_id + lead_score_propensity_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.lead_score_propensity.project_id : local.feature_store_project_id audience_segmentation_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.audience_segmentation.project_id : local.feature_store_project_id auto_audience_segmentation_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.auto_audience_segmentation.project_id : local.feature_store_project_id aggregated_vbb_project_id = null_resource.check_bigquery_api.id != "" ? local.config_vars.bigquery.dataset.aggregated_vbb.project_id : local.feature_store_project_id @@ -36,7 +37,7 @@ module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" version = "17.0.0" - disable_dependent_services = true + disable_dependent_services = false disable_services_on_destroy = false project_id = local.feature_store_project_id @@ -115,6 +116,14 @@ resource "google_bigquery_connection" "vertex_ai_connection" { project = null_resource.check_aiplatform_api.id != "" ? module.project_services.project_id : local.feature_store_project_id location = local.config_bigquery.region cloud_resource {} + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } @@ -144,7 +153,8 @@ resource "google_project_iam_member" "vertex_ai_connection_sa_roles" { # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = true + #prevent_destroy = true + create_before_destroy = true } } @@ -157,6 +167,14 @@ resource "time_sleep" "wait_for_vertex_ai_connection_sa_role_propagation" { depends_on = [ google_project_iam_member.vertex_ai_connection_sa_roles ] + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. 
+ lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } diff --git a/infrastructure/terraform/modules/pipelines/main.tf b/infrastructure/terraform/modules/pipelines/main.tf index 86d363d7..5de45c5d 100644 --- a/infrastructure/terraform/modules/pipelines/main.tf +++ b/infrastructure/terraform/modules/pipelines/main.tf @@ -37,7 +37,7 @@ module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" version = "17.0.0" - disable_dependent_services = true + disable_dependent_services = false disable_services_on_destroy = false project_id = local.pipeline_vars.project_id diff --git a/infrastructure/terraform/modules/pipelines/pipelines.tf b/infrastructure/terraform/modules/pipelines/pipelines.tf index a55e9136..0074a536 100644 --- a/infrastructure/terraform/modules/pipelines/pipelines.tf +++ b/infrastructure/terraform/modules/pipelines/pipelines.tf @@ -18,6 +18,14 @@ resource "google_service_account" "service_account" { account_id = local.pipeline_vars.service_account_id display_name = local.pipeline_vars.service_account_id description = "Service Account to run Vertex AI Pipelines" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # Wait for the pipelines service account to be created @@ -72,6 +80,14 @@ resource "google_project_iam_member" "pipelines_sa_roles" { "roles/compute.networkUser" ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource binds the service account to the required roles in the mds project @@ -89,6 +105,14 @@ resource "google_project_iam_member" "pipelines_sa_mds_project_roles" { "roles/bigquery.dataViewer" ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource creates a service account to run the dataflow jobs @@ -97,6 +121,14 @@ resource "google_service_account" "dataflow_worker_service_account" { account_id = local.dataflow_vars.worker_service_account_id display_name = local.dataflow_vars.worker_service_account_id description = "Service Account to run Dataflow jobs" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. 
The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # Wait for the dataflow worker service account to be created @@ -143,6 +175,14 @@ resource "google_project_iam_member" "dataflow_worker_sa_roles" { "roles/storage.objectAdmin", ]) role = each.key + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource binds the service account to the required roles @@ -157,6 +197,14 @@ resource "google_service_account_iam_member" "dataflow_sa_iam" { service_account_id = "projects/${module.project_services.project_id}/serviceAccounts/${google_service_account.dataflow_worker_service_account.email}" role = "roles/iam.serviceAccountUser" member = "serviceAccount:${google_service_account.service_account.email}" + + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. + lifecycle { + ignore_changes = all + #prevent_destroy = true + create_before_destroy = true + } } # This resource creates a Cloud Storage Bucket for the pipeline artifacts @@ -170,12 +218,12 @@ resource "google_storage_bucket" "pipelines_bucket" { # even if it contains objects. In this case, it's set to false, which means that the bucket will not be destroyed if it contains objects. force_destroy = false - # The lifecycle block allows you to configure the lifecycle of the bucket. - # In this case, the ignore_changes attribute is set to all, which means that Terraform - # will ignore any changes to the bucket's lifecycle configuration. The prevent_destroy attribute is set to false, which means that the bucket can be destroyed. + # The lifecycle block is used to configure the lifecycle of the table. In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = false ##true + #prevent_destroy = true + create_before_destroy = true } } @@ -190,12 +238,12 @@ resource "google_storage_bucket" "custom_model_bucket" { # even if it contains objects. In this case, it's set to false, which means that the bucket will not be destroyed if it contains objects. force_destroy = false - # The lifecycle block allows you to configure the lifecycle of the bucket. - # In this case, the ignore_changes attribute is set to all, which means that Terraform - # will ignore any changes to the bucket's lifecycle configuration. The prevent_destroy attribute is set to false, which means that the bucket can be destroyed. + # The lifecycle block is used to configure the lifecycle of the table. 
In this case, the ignore_changes attribute is set to all, which means that Terraform will ignore + # any changes to the table and will not attempt to update the table. The prevent_destroy attribute is set to true, which means that Terraform will prevent the table from being destroyed. lifecycle { ignore_changes = all - prevent_destroy = false ##true + #prevent_destroy = true + create_before_destroy = true } } @@ -350,6 +398,31 @@ resource "null_resource" "check_pipeline_docker_image_pushed" { ## Feature Engineering Pipelines ####### +# This resource is used to compile and upload the Vertex AI pipeline for feature engineering - lead score propensity use case +resource "null_resource" "compile_feature_engineering_lead_score_propensity_pipeline" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + pipelines_repo_id = google_artifact_registry_repository.pipelines-repo.id + pipelines_repo_create_time = google_artifact_registry_repository.pipelines-repo.create_time + source_content_hash = local.pipelines_content_hash + upstream_resource_dependency = null_resource.check_pipeline_docker_image_pushed.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. + # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. + provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-lead-score-propensity.execution -o fe_lead_score_propensity.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f fe_lead_score_propensity.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.feature-creation-lead-score-propensity.execution -i fe_lead_score_propensity.yaml + EOT + working_dir = self.triggers.working_dir + } +} + + # This resource is used to compile and upload the Vertex AI pipeline for feature engineering - auto audience segmentation use case resource "null_resource" "compile_feature_engineering_auto_audience_segmentation_pipeline" { triggers = { @@ -358,7 +431,7 @@ resource "null_resource" "compile_feature_engineering_auto_audience_segmentation pipelines_repo_id = google_artifact_registry_repository.pipelines-repo.id pipelines_repo_create_time = google_artifact_registry_repository.pipelines-repo.create_time source_content_hash = local.pipelines_content_hash - upstream_resource_dependency = null_resource.build_push_pipelines_components_image.id + upstream_resource_dependency = null_resource.compile_feature_engineering_lead_score_propensity_pipeline.id } # The provisioner block specifies the command that will be executed to compile and upload the pipeline. 
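Once the `local-exec` steps above have compiled `fe_lead_score_propensity.yaml` and the uploader has pushed it to the pipelines Artifact Registry repository, the same template can be triggered ad hoc with the Vertex AI SDK, which is useful when validating a freshly applied configuration. Everything in the sketch (region, repository and package names, service account) is an assumption to adapt, and pipeline parameters may already be baked in at compile time.

```python
# Sketch: manually submit the compiled lead score feature engineering pipeline
# from its Artifact Registry template. All names and URLs below are assumptions.
from google.cloud import aiplatform

aiplatform.init(project="my-pipelines-project", location="us-central1")  # placeholders

job = aiplatform.PipelineJob(
    display_name="feature-creation-lead-score-propensity-manual",
    # Assumed layout: https://<region>-kfp.pkg.dev/<project>/<repository>/<package>/<tag>
    template_path="https://us-central1-kfp.pkg.dev/my-pipelines-project/pipelines-repo/fe-lead-score-propensity/latest",
    enable_caching=False,
)
job.submit(service_account="vertex-pipelines-sa@my-pipelines-project.iam.gserviceaccount.com")  # placeholder SA
```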
@@ -498,12 +571,54 @@ resource "null_resource" "compile_feature_engineering_customer_lifetime_value_pi ## Training and Inference Pipelines ### +# This resource is used to compile and upload the Vertex AI pipeline for training the propensity model - lead score propensity use case +resource "null_resource" "compile_lead_score_propensity_training_pipelines" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + upstream_resource_dependency = null_resource.compile_feature_engineering_customer_lifetime_value_pipeline.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. + # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. + provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.training -o lead_score_propensity_training.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f lead_score_propensity_training.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.training -i lead_score_propensity_training.yaml + EOT + working_dir = self.triggers.working_dir + } +} + +# This resource is used to compile and upload the Vertex AI pipeline for prediction using the propensity model - lead score propensity use case +resource "null_resource" "compile_lead_score_propensity_prediction_pipelines" { + triggers = { + working_dir = "${local.source_root_dir}/python" + tag = local.compile_pipelines_tag + upstream_resource_dependency = null_resource.compile_lead_score_propensity_training_pipelines.id + } + + # The provisioner block specifies the command that will be executed to compile and upload the pipeline. + # This command will execute the compiler function in the pipelines module, which will compile the pipeline YAML file, and the uploader function, + # which will upload the pipeline YAML file to the specified Artifact Registry repository. The scheduler function will then schedule the pipeline to run on a regular basis. 
+ provisioner "local-exec" { + command = <<-EOT + ${var.uv_run_alias} python -m pipelines.compiler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.prediction -o lead_score_propensity_prediction.yaml + ${var.uv_run_alias} python -m pipelines.uploader -c ${local.config_file_path_relative_python_run_dir} -f lead_score_propensity_prediction.yaml -t ${self.triggers.tag} -t latest + ${var.uv_run_alias} python -m pipelines.scheduler -c ${local.config_file_path_relative_python_run_dir} -p vertex_ai.pipelines.lead_score_propensity.prediction -i lead_score_propensity_prediction.yaml + EOT + working_dir = self.triggers.working_dir + } +} + # This resource is used to compile and upload the Vertex AI pipeline for training the propensity model - purchase propensity use case resource "null_resource" "compile_purchase_propensity_training_pipelines" { triggers = { working_dir = "${local.source_root_dir}/python" tag = local.compile_pipelines_tag - upstream_resource_dependency = null_resource.compile_feature_engineering_customer_lifetime_value_pipeline.id + upstream_resource_dependency = null_resource.compile_lead_score_propensity_prediction_pipelines.id } # The provisioner block specifies the command that will be executed to compile and upload the pipeline. diff --git a/infrastructure/terraform/terraform-sample.tfvars b/infrastructure/terraform/terraform-sample.tfvars index 71f440d7..ff4fd015 100644 --- a/infrastructure/terraform/terraform-sample.tfvars +++ b/infrastructure/terraform/terraform-sample.tfvars @@ -39,6 +39,151 @@ source_ads_export_data = [ feature_store_project_id = "Project ID where feature store resources will be created" +################### PIPELINE CONFIGURATIONS ################################## + +pipeline_configuration = { + feature-creation-auto-audience-segmentation = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-audience-segmentation = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-purchase-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-churn-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-customer-ltv = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-aggregated-value-based-bidding = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + feature-creation-lead-score-propensity = { + execution = { + schedule = { + state = "ACTIVE" + } + } + } + value_based_bidding = { + training = { + schedule = { + state = "PAUSED" + } + } + explanation = { + schedule = { + state = "ACTIVE" + } + } + } + purchase_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + churn_propensity = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + auto_segmentation = { + training = { + schedule = { + state = "PAUSED" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + propensity_clv = { + training = { + schedule = { + state = "PAUSED" + } + } + } + clv = { + training = { + schedule = { + state = "ACTIVE" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + lead_score_propensity = { + training = { + schedule = { + state = 
"PAUSED" + } + } + prediction = { + schedule = { + state = "ACTIVE" + } + } + } + } + #################### ML MODEL VARIABLES ################################# website_url = "Customer Website URL" # i.e. "https://shop.googlemerchandisestore.com/" diff --git a/python/activation/main.py b/python/activation/main.py index cfa7319e..21c416bd 100644 --- a/python/activation/main.py +++ b/python/activation/main.py @@ -62,6 +62,7 @@ def _add_argparse_args(cls, parser): - purchase-propensity-15-15 - purchase-propensity-15-7 - churn-propensity-30-15 + - lead-score-propensity-5-1 activation_type_configuration: The GCS path to the configuration file for all activation types. """ @@ -110,6 +111,7 @@ def _add_argparse_args(cls, parser): purchase-propensity-15-15 purchase-propensity-15-7 churn-propensity-30-15 + lead-score-propensity-5-1 ''', required=True ) diff --git a/python/ga4_setup/setup.py b/python/ga4_setup/setup.py index 658f1650..dd4c885d 100644 --- a/python/ga4_setup/setup.py +++ b/python/ga4_setup/setup.py @@ -276,6 +276,7 @@ def create_custom_dimensions(configuration: map): create_custom_dimensions_for('CLTV', ['cltv_decile'], existing_dimensions, configuration) create_custom_dimensions_for('Auto Audience Segmentation', ['a_a_s_prediction'], existing_dimensions, configuration) create_custom_dimensions_for('Churn Propensity', ['c_p_prediction', 'c_p_decile'], existing_dimensions, configuration) + create_custom_dimensions_for('Lead Score Propensity', ['l_s_p_prediction', 'l_s_p_decile'], existing_dimensions, configuration) diff --git a/python/pipelines/automl_tabular_pl_v4.yaml b/python/pipelines/automl_tabular_pl_v4.yaml index 4d20b803..6bdc8cfb 100644 --- a/python/pipelines/automl_tabular_pl_v4.yaml +++ b/python/pipelines/automl_tabular_pl_v4.yaml @@ -11151,21 +11151,21 @@ root: isOptional: true parameterType: BOOLEAN distill_batch_predict_machine_type: - defaultValue: n1-standard-16 + defaultValue: n1-highmem-8 description: 'The prediction server machine type for batch predict component in the model distillation.' isOptional: true parameterType: STRING distill_batch_predict_max_replica_count: - defaultValue: 25.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch predict component in the model distillation.' isOptional: true parameterType: NUMBER_INTEGER distill_batch_predict_starting_replica_count: - defaultValue: 25.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch predict component in the model distillation.' @@ -11201,14 +11201,14 @@ root: isOptional: true parameterType: STRING evaluation_batch_explain_max_replica_count: - defaultValue: 10.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch explain components during evaluation.' isOptional: true parameterType: NUMBER_INTEGER evaluation_batch_explain_starting_replica_count: - defaultValue: 10.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch explain components during evaluation.' @@ -11222,14 +11222,14 @@ root: isOptional: true parameterType: STRING evaluation_batch_predict_max_replica_count: - defaultValue: 20.0 + defaultValue: 5.0 description: 'The max number of prediction server for batch predict components during evaluation.' isOptional: true parameterType: NUMBER_INTEGER evaluation_batch_predict_starting_replica_count: - defaultValue: 20.0 + defaultValue: 5.0 description: 'The initial number of prediction server for batch predict components during evaluation.' 
@@ -11279,7 +11279,7 @@ root: description: The GCP region that runs the pipeline components. parameterType: STRING max_selected_features: - defaultValue: 1000.0 + defaultValue: 100.0 description: number of features to select for training. isOptional: true parameterType: NUMBER_INTEGER @@ -11356,7 +11356,7 @@ root: isOptional: true parameterType: BOOLEAN stage_1_num_parallel_trials: - defaultValue: 35.0 + defaultValue: 5.0 description: Number of parallel trails for stage 1. isOptional: true parameterType: NUMBER_INTEGER @@ -11367,7 +11367,7 @@ root: isOptional: true parameterType: LIST stage_2_num_parallel_trials: - defaultValue: 35.0 + defaultValue: 5.0 description: Number of parallel trails for stage 2. isOptional: true parameterType: NUMBER_INTEGER diff --git a/python/pipelines/compiler.py b/python/pipelines/compiler.py index 6b5224dd..97bbc62c 100644 --- a/python/pipelines/compiler.py +++ b/python/pipelines/compiler.py @@ -31,6 +31,7 @@ 'vertex_ai.pipelines.feature-creation-purchase-propensity.execution': "pipelines.feature_engineering_pipelines.purchase_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-churn-propensity.execution': "pipelines.feature_engineering_pipelines.churn_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-customer-ltv.execution': "pipelines.feature_engineering_pipelines.customer_lifetime_value_feature_engineering_pipeline", + 'vertex_ai.pipelines.feature-creation-lead-score-propensity.execution': "pipelines.feature_engineering_pipelines.lead_score_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.auto_segmentation.training': "pipelines.auto_segmentation_pipelines.training_pl", 'vertex_ai.pipelines.auto_segmentation.prediction': "pipelines.auto_segmentation_pipelines.prediction_pl", 'vertex_ai.pipelines.segmentation.training': "pipelines.segmentation_pipelines.training_pl", @@ -39,6 +40,8 @@ 'vertex_ai.pipelines.purchase_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.churn_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.churn_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", + 'vertex_ai.pipelines.lead_score_propensity.training': None, # tabular workflows pipelines is precompiled + 'vertex_ai.pipelines.lead_score_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.propensity_clv.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.clv.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.clv.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_regression_pl", diff --git a/python/pipelines/feature_engineering_pipelines.py b/python/pipelines/feature_engineering_pipelines.py index deb7b88b..a15ffa12 100644 --- a/python/pipelines/feature_engineering_pipelines.py +++ b/python/pipelines/feature_engineering_pipelines.py @@ -196,8 +196,73 @@ def audience_segmentation_feature_engineering_pipeline( location=location, query=query_audience_segmentation_inference_preparation, timeout=timeout).set_display_name('audience_segmentation_inference_preparation').after(*phase_1) - - + + +@dsl.pipeline() +def lead_score_propensity_feature_engineering_pipeline( + project_id: str, + location: Optional[str], + query_lead_score_propensity_label: str, + query_user_dimensions: str, + query_user_rolling_window_metrics: str, + 
query_lead_score_propensity_inference_preparation: str, + query_lead_score_propensity_training_preparation: str, + timeout: Optional[float] = 3600.0 +): + """ + This pipeline defines the steps for feature engineering for the lead score propensity model. + + Args: + project_id: The Google Cloud project ID. + location: The Google Cloud region where the pipeline will be run. + query_lead_score_propensity_label: The SQL query that will be used to calculate the lead score propensity label. + query_user_dimensions: The SQL query that will be used to calculate the user dimensions. + query_user_rolling_window_metrics: The SQL query that will be used to calculate the user rolling window metrics. + query_lead_score_propensity_inference_preparation: The SQL query that will be used to prepare the inference data. + query_lead_score_propensity_training_preparation: The SQL query that will be used to prepare the training data. + timeout: The timeout for the pipeline in seconds. + + Returns: + None + """ + + # Features Preparation + phase_1 = list() + phase_1.append( + sp( + project=project_id, + location=location, + query=query_lead_score_propensity_label, + timeout=timeout).set_display_name('lead_score_propensity_label') + ) + phase_1.append( + sp( + project=project_id, + location=location, + query=query_user_dimensions, + timeout=timeout).set_display_name('user_dimensions') + ) + phase_1.append( + sp( + project=project_id, + location=location, + query=query_user_rolling_window_metrics, + timeout=timeout).set_display_name('user_rolling_window_metrics') + ) + # Training data preparation + lead_score_propensity_train_prep = sp( + project=project_id, + location=location, + query=query_lead_score_propensity_training_preparation, + timeout=timeout).set_display_name('lead_score_propensity_training_preparation').after(*phase_1) + # Inference data preparation + lead_score_propensity_inf_prep = sp( + project=project_id, + location=location, + query=query_lead_score_propensity_inference_preparation, + timeout=timeout).set_display_name('lead_score_propensity_inference_preparation').after(*phase_1) + + @dsl.pipeline() def purchase_propensity_feature_engineering_pipeline( project_id: str, diff --git a/python/pipelines/scheduler.py b/python/pipelines/scheduler.py index ce554ddd..7e00dc8e 100644 --- a/python/pipelines/scheduler.py +++ b/python/pipelines/scheduler.py @@ -37,8 +37,11 @@ def check_extention(file_path: str, type: str = '.yaml'): 'vertex_ai.pipelines.feature-creation-purchase-propensity.execution': "pipelines.feature_engineering_pipelines.purchase_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-churn-propensity.execution': "pipelines.feature_engineering_pipelines.churn_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.feature-creation-customer-ltv.execution': "pipelines.feature_engineering_pipelines.customer_lifetime_value_feature_engineering_pipeline", + 'vertex_ai.pipelines.feature-creation-lead-score-propensity.execution': "pipelines.feature_engineering_pipelines.lead_score_propensity_feature_engineering_pipeline", 'vertex_ai.pipelines.purchase_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.purchase_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", + 'vertex_ai.pipelines.lead_score_propensity.training': None, # tabular workflows pipelines is precompiled + 'vertex_ai.pipelines.lead_score_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl",
'vertex_ai.pipelines.churn_propensity.training': None, # tabular workflows pipelines is precompiled 'vertex_ai.pipelines.churn_propensity.prediction': "pipelines.tabular_pipelines.prediction_binary_classification_pl", 'vertex_ai.pipelines.segmentation.training': "pipelines.segmentation_pipelines.training_pl", diff --git a/python/pipelines/transformations-lead-score-propensity.json b/python/pipelines/transformations-lead-score-propensity.json new file mode 100644 index 00000000..28ca5e70 --- /dev/null +++ b/python/pipelines/transformations-lead-score-propensity.json @@ -0,0 +1,368 @@ +[ + { + "numeric": { + "column_name": "user_ltv_revenue", + "invalid_values_allowed": true + } + }, + { + "categorical": { + "column_name": "device_category" + } + }, + { + "categorical": { + "column_name": "device_mobile_brand_name" + } + }, + { + "categorical": { + "column_name": "device_mobile_model_name" + } + }, + { + "categorical": { + "column_name": "device_os" + } + }, + { + "categorical": { + "column_name": "device_language" + } + }, + { + "categorical": { + "column_name": "device_web_browser" + } + }, + { + "categorical": { + "column_name": "geo_sub_continent" + } + }, + { + "categorical": { + "column_name": "geo_country" + } + }, + { + "categorical": { + "column_name": "geo_region" + } + }, + { + "categorical": { + "column_name": "geo_city" + } + }, + { + "categorical": { + "column_name": "geo_metro" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_medium" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_name" + } + }, + { + "categorical": { + "column_name": "last_traffic_source_source" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_medium" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_name" + } + }, + { + "categorical": { + "column_name": "first_traffic_source_source" + } + }, + { + "categorical": { + "column_name": "has_signed_in_with_user_id" + } + }, + { + "numeric": { + "column_name": "scroll_50_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_50_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "scroll_90_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "view_search_results_past_5_day", + "invalid_values_allowed": true + } + }, + { + 
"numeric": { + "column_name": "file_download_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "file_download_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_list_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_print_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "sign_up_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_favorite_past_5_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_1_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_2_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_3_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_4_day", + "invalid_values_allowed": true + } + }, + { + "numeric": { + "column_name": "recipe_add_to_menu_past_5_day", + "invalid_values_allowed": true + } + } +] \ No newline at end of file diff --git a/sql/procedure/lead_score_propensity_inference_preparation.sqlx b/sql/procedure/lead_score_propensity_inference_preparation.sqlx new file mode 100644 index 00000000..ec12d7fe --- /dev/null +++ b/sql/procedure/lead_score_propensity_inference_preparation.sqlx @@ -0,0 +1,672 @@ +-- Copyright 2023 Google LLC +-- +-- 
Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +DECLARE lastest_processed_time_ud TIMESTAMP; +DECLARE lastest_processed_time_useam TIMESTAMP; +DECLARE lastest_processed_time_uwlm TIMESTAMP; +DECLARE lastest_processed_time_um TIMESTAMP; + +-- Setting procedure to lookback from the day before `inference_date` +SET inference_date = DATE_SUB(inference_date, INTERVAL 1 DAY); + +SET lastest_processed_time_ud = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_useam = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_session_event_aggregated_metrics` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_uwlm = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` WHERE feature_date = inference_date LIMIT 1); +SET lastest_processed_time_um = (SELECT MAX(processed_timestamp) FROM `{{feature_store_project_id}}.{{feature_store_dataset}}.user_scoped_metrics` WHERE feature_date = inference_date LIMIT 1); + +CREATE OR REPLACE TEMP TABLE inference_preparation_ud as ( + SELECT DISTINCT + -- The user pseudo id + UD.user_pseudo_id, + -- The user id + MAX(UD.user_id) OVER(user_dimensions_window) AS user_id, + -- The feature date + UD.feature_date, + -- The user lifetime value revenue + MAX(UD.user_ltv_revenue) OVER(user_dimensions_window) AS user_ltv_revenue, + -- The device category + MAX(UD.device_category) OVER(user_dimensions_window) AS device_category, + -- The device brand name + MAX(UD.device_mobile_brand_name) OVER(user_dimensions_window) AS device_mobile_brand_name, + -- The device model name + MAX(UD.device_mobile_model_name) OVER(user_dimensions_window) AS device_mobile_model_name, + -- The device operating system + MAX(UD.device_os) OVER(user_dimensions_window) AS device_os, + -- The device language + MAX(UD.device_language) OVER(user_dimensions_window) AS device_language, + -- The device web browser + MAX(UD.device_web_browser) OVER(user_dimensions_window) AS device_web_browser, + -- The user sub continent + MAX(UD.geo_sub_continent) OVER(user_dimensions_window) AS geo_sub_continent, + -- The user country + MAX(UD.geo_country) OVER(user_dimensions_window) AS geo_country, + -- The user region + MAX(UD.geo_region) OVER(user_dimensions_window) AS geo_region, + -- The user city + MAX(UD.geo_city) OVER(user_dimensions_window) AS geo_city, + -- The user metro + MAX(UD.geo_metro) OVER(user_dimensions_window) AS geo_metro, + -- The user last traffic source medium + MAX(UD.last_traffic_source_medium) OVER(user_dimensions_window) AS last_traffic_source_medium, + -- The user last traffic source name + MAX(UD.last_traffic_source_name) OVER(user_dimensions_window) AS last_traffic_source_name, + -- The user last traffic source source + MAX(UD.last_traffic_source_source) OVER(user_dimensions_window) AS last_traffic_source_source, + -- 
The user first traffic source medium + MAX(UD.first_traffic_source_medium) OVER(user_dimensions_window) AS first_traffic_source_medium, + -- The user first traffic source name + MAX(UD.first_traffic_source_name) OVER(user_dimensions_window) AS first_traffic_source_name, + -- The user first traffic source source + MAX(UD.first_traffic_source_source) OVER(user_dimensions_window) AS first_traffic_source_source, + -- Whether the user has signed in with user ID + MAX(UD.has_signed_in_with_user_id) OVER(user_dimensions_window) AS has_signed_in_with_user_id, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` UD +INNER JOIN + `{{project_id}}.{{mds_dataset}}.latest_event_per_user_last_72_hours` LEU +ON + UD.user_pseudo_id = LEU.user_pseudo_id +WHERE + -- In the future consider `feature_date BETWEEN start_date AND end_date`, to process multiple days. Modify Partition BY + UD.feature_date = inference_date + AND UD.processed_timestamp = lastest_processed_time_ud +WINDOW + user_dimensions_window AS (PARTITION BY UD.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + + +CREATE OR REPLACE TEMP TABLE inference_preparation_uwlm as ( + SELECT DISTINCT + -- User pseudo id + UWLM.user_pseudo_id, + -- Feature date + UWLM.feature_date, + -- Calculate the maximum value for each metric over the window + MAX(UWLM.scroll_50_past_1_day) OVER(user_rolling_lead_window) AS scroll_50_past_1_day, + -- Maximum active users in the past 2 days + MAX(UWLM.scroll_50_past_2_day) OVER(user_rolling_lead_window) AS scroll_50_past_2_day, + -- Maximum active users in the past 3 days + MAX(UWLM.scroll_50_past_3_day) OVER(user_rolling_lead_window) AS scroll_50_past_3_day, + -- Maximum active users in the past 4 days + MAX(UWLM.scroll_50_past_4_day) OVER(user_rolling_lead_window) AS scroll_50_past_4_day, + -- Maximum active users in the past 5 days + MAX(UWLM.scroll_50_past_5_day) OVER(user_rolling_lead_window) AS scroll_50_past_5_day, + -- Maximum scroll_90 in the past 1 day + MAX(UWLM.scroll_90_past_1_day) OVER(user_rolling_lead_window) AS scroll_90_past_1_day, + -- Maximum scroll_90 in the past 2 days + MAX(UWLM.scroll_90_past_2_day) OVER(user_rolling_lead_window) AS scroll_90_past_2_day, + -- Maximum scroll_90 in the past 3 days + MAX(UWLM.scroll_90_past_3_day) OVER(user_rolling_lead_window) AS scroll_90_past_3_day, + -- Maximum scroll_90 in the past 4 days + MAX(UWLM.scroll_90_past_4_day) OVER(user_rolling_lead_window) AS scroll_90_past_4_day, + -- Maximum scroll_90 in the past 5 days + MAX(UWLM.scroll_90_past_5_day) OVER(user_rolling_lead_window) AS scroll_90_past_5_day, + -- Maximum view_search_results in the past 1 day + MAX(UWLM.view_search_results_past_1_day) OVER(user_rolling_lead_window) AS view_search_results_past_1_day, + -- Maximum view_search_results in the past 2 days + MAX(UWLM.view_search_results_past_2_day) OVER(user_rolling_lead_window) AS view_search_results_past_2_day, + -- Maximum view_search_results in the past 3 days + MAX(UWLM.view_search_results_past_3_day) OVER(user_rolling_lead_window) AS view_search_results_past_3_day, + -- Maximum view_search_results in the past 4 days + MAX(UWLM.view_search_results_past_4_day) OVER(user_rolling_lead_window) AS view_search_results_past_4_day, + -- Maximum view_search_results in the past 5 days + MAX(UWLM.view_search_results_past_5_day) OVER(user_rolling_lead_window) AS view_search_results_past_5_day, + -- Maximum recipe_add_to_list in the past 1 day + MAX(UWLM.recipe_add_to_list_past_1_day) 
OVER(user_rolling_lead_window) AS recipe_add_to_list_past_1_day, + -- Maximum recipe_add_to_list in the past 2 days + MAX(UWLM.recipe_add_to_list_past_2_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_2_day, + -- Maximum recipe_add_to_list in the past 3 days + MAX(UWLM.recipe_add_to_list_past_3_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_3_day, + -- Maximum recipe_add_to_list in the past 4 days + MAX(UWLM.recipe_add_to_list_past_4_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_4_day, + -- Maximum recipe_add_to_list in the past 5 days + MAX(UWLM.recipe_add_to_list_past_5_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_5_day, + -- Maximum recipe_print in the past 1 day + MAX(UWLM.recipe_print_past_1_day) OVER(user_rolling_lead_window) AS recipe_print_past_1_day, + -- Maximum recipe_print in the past 2 days + MAX(UWLM.recipe_print_past_2_day) OVER(user_rolling_lead_window) AS recipe_print_past_2_day, + -- Maximum recipe_print in the past 3 days + MAX(UWLM.recipe_print_past_3_day) OVER(user_rolling_lead_window) AS recipe_print_past_3_day, + -- Maximum recipe_print in the past 4 days + MAX(UWLM.recipe_print_past_4_day) OVER(user_rolling_lead_window) AS recipe_print_past_4_day, + -- Maximum recipe_print in the past 5 days + MAX(UWLM.recipe_print_past_5_day) OVER(user_rolling_lead_window) AS recipe_print_past_5_day, + -- Maximum sign_up_past in the past 1 day + MAX(UWLM.sign_up_past_1_day) OVER(user_rolling_lead_window) AS sign_up_past_1_day, + -- Maximum sign_up_past in the past 2 days + MAX(UWLM.sign_up_past_2_day) OVER(user_rolling_lead_window) AS sign_up_past_2_day, + -- Maximum sign_up_past in the past 3 days + MAX(UWLM.sign_up_past_3_day) OVER(user_rolling_lead_window) AS sign_up_past_3_day, + -- Maximum sign_up_past in the past 4 days + MAX(UWLM.sign_up_past_4_day) OVER(user_rolling_lead_window) AS sign_up_past_4_day, + -- Maximum sign_up_past in the past 5 days + MAX(UWLM.sign_up_past_5_day) OVER(user_rolling_lead_window) AS sign_up_past_5_day, + -- Maximum recipe_favorite in the past 1 day + MAX(UWLM.recipe_favorite_past_1_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_1_day, + -- Maximum recipe_favorite in the past 2 days + MAX(UWLM.recipe_favorite_past_2_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_2_day, + -- Maximum recipe_favorite in the past 3 days + MAX(UWLM.recipe_favorite_past_3_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_3_day, + -- Maximum recipe_favorite in the past 4 days + MAX(UWLM.recipe_favorite_past_4_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_4_day, + -- Maximum recipe_favorite in the past 5 days + MAX(UWLM.recipe_favorite_past_5_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_5_day, + -- Maximum recipe_add_to_menu in the past 1 day + MAX(UWLM.recipe_add_to_menu_past_1_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_1_day, + -- Maximum recipe_add_to_menu in the past 2 days + MAX(UWLM.recipe_add_to_menu_past_2_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_2_day, + -- Maximum recipe_add_to_menu in the past 3 days + MAX(UWLM.recipe_add_to_menu_past_3_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_3_day, + -- Maximum recipe_add_to_menu in the past 4 days + MAX(UWLM.recipe_add_to_menu_past_4_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_4_day, + -- Maximum recipe_add_to_menu in the past 5 days + MAX(UWLM.recipe_add_to_menu_past_5_day) OVER(user_rolling_lead_window) AS 
recipe_add_to_menu_past_5_day, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` UWLM +INNER JOIN + `{{project_id}}.{{mds_dataset}}.latest_event_per_user_last_72_hours` LEU +ON + UWLM.user_pseudo_id = LEU.user_pseudo_id +WHERE + -- Filter for the features in the inferecen date + UWLM.feature_date = inference_date + AND UWLM.processed_timestamp = lastest_processed_time_uwlm +WINDOW + user_rolling_lead_window AS (PARTITION BY UWLM.user_pseudo_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating all features over the dates intervals. +CREATE OR REPLACE TEMP TABLE inference_preparation as ( + SELECT DISTINCT + UD.user_pseudo_id, + UD.user_id, + UD.feature_date, + UD.user_ltv_revenue, + UD.device_category, + UD.device_mobile_brand_name, + UD.device_mobile_model_name, + UD.device_os, + UD.device_language, + UD.device_web_browser, + UD.geo_sub_continent, + UD.geo_country, + UD.geo_region, + UD.geo_city, + UD.geo_metro, + UD.last_traffic_source_medium, + UD.last_traffic_source_name, + UD.last_traffic_source_source, + UD.first_traffic_source_medium, + UD.first_traffic_source_name, + UD.first_traffic_source_source, + UD.has_signed_in_with_user_id, + UWLM.scroll_50_past_1_day, + UWLM.scroll_50_past_2_day, + UWLM.scroll_50_past_3_day, + UWLM.scroll_50_past_4_day, + UWLM.scroll_50_past_5_day, + UWLM.scroll_90_past_1_day, + UWLM.scroll_90_past_2_day, + UWLM.scroll_90_past_3_day, + UWLM.scroll_90_past_4_day, + UWLM.scroll_90_past_5_day, + UWLM.view_search_results_past_1_day, + UWLM.view_search_results_past_2_day, + UWLM.view_search_results_past_3_day, + UWLM.view_search_results_past_4_day, + UWLM.view_search_results_past_5_day, + UWLM.recipe_add_to_list_past_1_day, + UWLM.recipe_add_to_list_past_2_day, + UWLM.recipe_add_to_list_past_3_day, + UWLM.recipe_add_to_list_past_4_day, + UWLM.recipe_add_to_list_past_5_day, + UWLM.recipe_print_past_1_day, + UWLM.recipe_print_past_2_day, + UWLM.recipe_print_past_3_day, + UWLM.recipe_print_past_4_day, + UWLM.recipe_print_past_5_day, + UWLM.sign_up_past_1_day, + UWLM.sign_up_past_2_day, + UWLM.sign_up_past_3_day, + UWLM.sign_up_past_4_day, + UWLM.sign_up_past_5_day, + UWLM.recipe_favorite_past_1_day, + UWLM.recipe_favorite_past_2_day, + UWLM.recipe_favorite_past_3_day, + UWLM.recipe_favorite_past_4_day, + UWLM.recipe_favorite_past_5_day, + UWLM.recipe_add_to_menu_past_1_day, + UWLM.recipe_add_to_menu_past_2_day, + UWLM.recipe_add_to_menu_past_3_day, + UWLM.recipe_add_to_menu_past_4_day, + UWLM.recipe_add_to_menu_past_5_day +FROM + inference_preparation_ud UD +INNER JOIN + inference_preparation_uwlm UWLM +ON + UWLM.user_pseudo_id = UD.user_pseudo_id + AND UWLM.feature_date = UD.feature_date +); + +DELETE FROM `{{project_id}}.{{dataset}}.{{insert_table}}` WHERE TRUE; + +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` +( + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + 
scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day +) +SELECT DISTINCT +feature_date, + user_pseudo_id, + user_id, + MIN(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date) as user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day +FROM inference_preparation; + + +CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_inference_5_1` AS( + SELECT DISTINCT + CURRENT_TIMESTAMP() AS processed_timestamp, + feature_date, + user_pseudo_id, + LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_id, + LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS user_ltv_revenue, + LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_category, + LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_brand_name, + LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_mobile_model_name, + LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) 
AS device_os, + LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_language, + LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS device_web_browser, + LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_sub_continent, + LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_country, + LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_region, + LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_city, + LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS geo_metro, + LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_medium, + LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_name, + LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS last_traffic_source_source, + LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_medium, + LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_name, + LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS first_traffic_source_source, + LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS has_signed_in_with_user_id, + LAST_VALUE(scroll_50_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_1_day, + LAST_VALUE(scroll_50_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_2_day, + LAST_VALUE(scroll_50_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_3_day, + LAST_VALUE(scroll_50_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_4_day, + LAST_VALUE(scroll_50_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_5_day, + LAST_VALUE(scroll_90_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_1_day, + LAST_VALUE(scroll_90_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_2_day, + LAST_VALUE(scroll_90_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_3_day, + LAST_VALUE(scroll_90_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_4_day, + LAST_VALUE(scroll_90_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_5_day, + LAST_VALUE(view_search_results_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_1_day, + LAST_VALUE(view_search_results_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_2_day, + LAST_VALUE(view_search_results_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_3_day, + LAST_VALUE(view_search_results_past_4_day) 
OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_4_day, + LAST_VALUE(view_search_results_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_5_day, + LAST_VALUE(recipe_add_to_list_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_1_day, + LAST_VALUE(recipe_add_to_list_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_2_day, + LAST_VALUE(recipe_add_to_list_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_3_day, + LAST_VALUE(recipe_add_to_list_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_4_day, + LAST_VALUE(recipe_add_to_list_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_5_day, + LAST_VALUE(recipe_print_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_1_day, + LAST_VALUE(recipe_print_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_2_day, + LAST_VALUE(recipe_print_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_3_day, + LAST_VALUE(recipe_print_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_4_day, + LAST_VALUE(recipe_print_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_5_day, + LAST_VALUE(sign_up_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_1_day, + LAST_VALUE(sign_up_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_2_day, + LAST_VALUE(sign_up_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_3_day, + LAST_VALUE(sign_up_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_4_day, + LAST_VALUE(sign_up_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_5_day, + LAST_VALUE(recipe_favorite_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_1_day, + LAST_VALUE(recipe_favorite_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_2_day, + LAST_VALUE(recipe_favorite_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_3_day, + LAST_VALUE(recipe_favorite_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_4_day, + LAST_VALUE(recipe_favorite_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_5_day, + LAST_VALUE(recipe_add_to_menu_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_1_day, + LAST_VALUE(recipe_add_to_menu_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_2_day, + LAST_VALUE(recipe_add_to_menu_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS 
recipe_add_to_menu_past_3_day, + LAST_VALUE(recipe_add_to_menu_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_4_day, + LAST_VALUE(recipe_add_to_menu_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_5_day + FROM `{{project_id}}.{{dataset}}.{{insert_table}}` +); + + +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_inference_5_1` +(processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day + ) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), + friendly_name="v_lead_score_propensity_inference_5_1", + description="View Lead Score Propensity Inference dataset using 5 days back to predict 1 day ahead. 
View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day +FROM ( +SELECT DISTINCT + processed_timestamp, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + -- Row number partitioned by user pseudo id ordered by feature date descending + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY 
feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_inference_5_1` +) +WHERE + -- Filter only for the most recent user example + user_row_order = 1; + diff --git a/sql/procedure/lead_score_propensity_label.sqlx b/sql/procedure/lead_score_propensity_label.sqlx new file mode 100644 index 00000000..c92afd27 --- /dev/null +++ b/sql/procedure/lead_score_propensity_label.sqlx @@ -0,0 +1,103 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- Run these window aggregations every day, for each date in the training and inference date ranges. +-- Setting procedure to lookback from the day before `input_date` until the day before `end_date` +SET input_date = DATE_SUB(input_date, INTERVAL 1 DAY); +SET end_date = DATE_SUB(end_date, INTERVAL 1 DAY); + +-- Future User metrics: 1-day future logins per user +CREATE OR REPLACE TEMP TABLE future_logins_per_user AS ( + SELECT + -- User's unique identifier + user_pseudo_id, + -- The date for which future logins are being calculated + input_date as event_date, + -- Calculates the maximum count of distinct login events per user occurring 1 day after `input_date` + MAX(COUNT(DISTINCT CASE DATE_DIFF(event_date, input_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS login_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON E.device_type_id = D.device_type_id + -- Filters events to be within the date range defined by input_date and end_date + WHERE event_date BETWEEN input_date AND end_date + -- Filter event with event name login + AND LOWER(E.event_name) IN ('login') + AND E.ga_session_id IS NOT NULL + AND D.device_os IS NOT NULL + -- Grouping by user pseudo ids + GROUP BY user_pseudo_id +); + +-- All users in the platform +CREATE OR REPLACE TEMP TABLE all_users_possible_logins as ( + SELECT DISTINCT + -- User's unique identifier + Users.user_pseudo_id, + -- The event date for which logins are being considered + Days.event_date as event_date, + -- Placeholder column for the future login indicator + NULL as login_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + CROSS JOIN + -- Generates a list of dates for the current date (`input_date`) + (SELECT event_date FROM UNNEST(GENERATE_DATE_ARRAY(input_date, end_date, INTERVAL 1 DAY)) AS event_date) Days + WHERE Days.event_date = input_date + -- Filter event with event name login + -- AND Users.event_name='login' + AND Users.ga_session_id IS NOT NULL +); + + +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS +SELECT DISTINCT + -- Timestamp when the data was processed + CURRENT_TIMESTAMP() AS processed_timestamp, + -- The date for which logins are being considered + A.event_date as feature_date, + -- User's unique identifier + A.user_pseudo_id, + -- Binary indicator: 1 if the user logged in on the following day, 0 otherwise (missing counts default to 0 and are capped at 1) + LEAST(COALESCE(B.login_day_1, 0), 1) AS login_day_1 +FROM 
all_users_possible_logins AS A +LEFT JOIN future_logins_per_user AS B +ON B.user_pseudo_id = A.user_pseudo_id +; + +-- Updates or inserts data into the target table +MERGE `{{project_id}}.{{dataset}}.{{insert_table}}` I +USING DataForTargetTable T +ON I.feature_date = T.feature_date + AND I.user_pseudo_id = T.user_pseudo_id +WHEN MATCHED THEN + -- Updates existing records + UPDATE SET + -- Updates the processed timestamp + I.processed_timestamp = T.processed_timestamp, + -- Updates login counts for each day + I.login_day_1 = T.login_day_1 +WHEN NOT MATCHED THEN + -- Inserts new records + INSERT + (processed_timestamp, + feature_date, + user_pseudo_id, + login_day_1) + VALUES + (T.processed_timestamp, + T.feature_date, + T.user_pseudo_id, + T.login_day_1) +; + +SET rows_added = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{project_id}}.{{dataset}}.{{insert_table}}`); diff --git a/sql/procedure/lead_score_propensity_training_preparation.sqlx b/sql/procedure/lead_score_propensity_training_preparation.sqlx new file mode 100644 index 00000000..6e0eed69 --- /dev/null +++ b/sql/procedure/lead_score_propensity_training_preparation.sqlx @@ -0,0 +1,1093 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +DECLARE custom_start_date DATE DEFAULT NULL; +DECLARE custom_end_date DATE DEFAULT NULL; + +-- custom_start_date: The start date of the data to be used for training. +-- custom_end_date: The end date of the data to be used for training. +SET custom_start_date = PARSE_DATE("%Y-%m-%d", {{custom_start_date}}); +SET custom_end_date = PARSE_DATE("%Y-%m-%d", {{custom_end_date}}); + +-- The procedure first checks if the custom_start_date and custom_end_date parameters are valid. +-- If either parameter is not valid, the procedure sets the corresponding date to the maximum or +-- minimum date of the available data. +IF custom_start_date IS NOT NULL AND custom_start_date >= start_date AND custom_start_date <= end_date + AND custom_start_date < custom_end_date THEN + SET start_date = custom_start_date; +END IF; + +IF custom_end_date IS NOT NULL AND custom_end_date <= end_date AND custom_end_date >= start_date + AND custom_end_date > custom_start_date THEN + SET end_date = custom_end_date; +END IF; + +-- This is a temp table consolidating user_dimensions over the dates intervals. 
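-- [Editor's aside, illustrative only] The consolidation tables that follow all rely on the same
-- BigQuery pattern: SELECT DISTINCT combined with analytic MAX() over a window keyed by
-- (user_pseudo_id, feature_date), which collapses any duplicate feature rows into one
-- deterministic row per user per day. A minimal sketch of that pattern, assuming a
-- hypothetical table `my_project.feature_store.user_dimensions`:
SELECT DISTINCT
  user_pseudo_id,
  feature_date,
  -- MAX() over the per-user, per-day window picks a single value even when the
  -- feature table holds several rows for the same user and date
  MAX(device_category) OVER (user_day_window) AS device_category,
  MAX(geo_country) OVER (user_day_window) AS geo_country
FROM `my_project.feature_store.user_dimensions`
WINDOW user_day_window AS (PARTITION BY user_pseudo_id, feature_date);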
+CREATE OR REPLACE TEMP TABLE training_preparation_ud as ( + SELECT DISTINCT + -- The user pseudo id + UD.user_pseudo_id, + -- The user id + MAX(UD.user_id) OVER(user_dimensions_window) AS user_id, + -- The feature date + UD.feature_date, + -- The user lifetime value revenue + MAX(UD.user_ltv_revenue) OVER(user_dimensions_window) AS user_ltv_revenue, + -- The device category + MAX(UD.device_category) OVER(user_dimensions_window) AS device_category, + -- The device brand name + MAX(UD.device_mobile_brand_name) OVER(user_dimensions_window) AS device_mobile_brand_name, + -- The device model name + MAX(UD.device_mobile_model_name) OVER(user_dimensions_window) AS device_mobile_model_name, + -- The device operating system + MAX(UD.device_os) OVER(user_dimensions_window) AS device_os, + -- The device language + MAX(UD.device_language) OVER(user_dimensions_window) AS device_language, + -- The device web browser + MAX(UD.device_web_browser) OVER(user_dimensions_window) AS device_web_browser, + -- The user sub continent + MAX(UD.geo_sub_continent) OVER(user_dimensions_window) AS geo_sub_continent, + -- The user country + MAX(UD.geo_country) OVER(user_dimensions_window) AS geo_country, + -- The user region + MAX(UD.geo_region) OVER(user_dimensions_window) AS geo_region, + -- The user city + MAX(UD.geo_city) OVER(user_dimensions_window) AS geo_city, + -- The user metro + MAX(UD.geo_metro) OVER(user_dimensions_window) AS geo_metro, + -- The user last traffic source medium + MAX(UD.last_traffic_source_medium) OVER(user_dimensions_window) AS last_traffic_source_medium, + -- The user last traffic source name + MAX(UD.last_traffic_source_name) OVER(user_dimensions_window) AS last_traffic_source_name, + -- The user last traffic source source + MAX(UD.last_traffic_source_source) OVER(user_dimensions_window) AS last_traffic_source_source, + -- The user first traffic source medium + MAX(UD.first_traffic_source_medium) OVER(user_dimensions_window) AS first_traffic_source_medium, + -- The user first traffic source name + MAX(UD.first_traffic_source_name) OVER(user_dimensions_window) AS first_traffic_source_name, + -- The user first traffic source source + MAX(UD.first_traffic_source_source) OVER(user_dimensions_window) AS first_traffic_source_source, + -- Whether the user has signed in with user ID + MAX(UD.has_signed_in_with_user_id) OVER(user_dimensions_window) AS has_signed_in_with_user_id, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_dimensions` UD +WHERE + -- Filter feature dates according to the defined date interval + UD.feature_date BETWEEN start_date AND end_date +WINDOW + user_dimensions_window AS (PARTITION BY UD.user_pseudo_id, UD.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating user rolling metrics over the dates intervals. 
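-- [Editor's aside, illustrative only] The *_past_N_day columns consumed below are daily rolling
-- event counts per user produced upstream by the `user_rolling_window_lead_metrics` procedure,
-- which is not part of this excerpt. The sketch below only illustrates the general shape of
-- such a metric; the table and event names (`my_project.my_mds_dataset.event`, `sign_up`) are
-- assumptions:
DECLARE feature_date DATE DEFAULT DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY);

SELECT
  user_pseudo_id,
  -- Distinct sign_up events observed exactly one day before the feature date
  COUNT(DISTINCT IF(DATE_DIFF(feature_date, event_date, DAY) = 1, event_timestamp, NULL)) AS sign_up_past_1_day,
  -- Distinct sign_up events observed exactly two days before the feature date
  COUNT(DISTINCT IF(DATE_DIFF(feature_date, event_date, DAY) = 2, event_timestamp, NULL)) AS sign_up_past_2_day
FROM `my_project.my_mds_dataset.event`
WHERE LOWER(event_name) = 'sign_up'
  AND event_date BETWEEN DATE_SUB(feature_date, INTERVAL 5 DAY) AND DATE_SUB(feature_date, INTERVAL 1 DAY)
GROUP BY user_pseudo_id;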
+CREATE OR REPLACE TEMP TABLE training_preparation_uwlm as ( + SELECT DISTINCT + -- User pseudo id + UWLM.user_pseudo_id, + -- Feature date + UWLM.feature_date, + -- Calculate the maximum value for each metric over the window + MAX(UWLM.scroll_50_past_1_day) OVER(user_rolling_lead_window) AS scroll_50_past_1_day, + -- Maximum active users in the past 2 days + MAX(UWLM.scroll_50_past_2_day) OVER(user_rolling_lead_window) AS scroll_50_past_2_day, + -- Maximum active users in the past 3 days + MAX(UWLM.scroll_50_past_3_day) OVER(user_rolling_lead_window) AS scroll_50_past_3_day, + -- Maximum active users in the past 4 days + MAX(UWLM.scroll_50_past_4_day) OVER(user_rolling_lead_window) AS scroll_50_past_4_day, + -- Maximum active users in the past 5 days + MAX(UWLM.scroll_50_past_5_day) OVER(user_rolling_lead_window) AS scroll_50_past_5_day, + -- Maximum scroll_90 in the past 1 day + MAX(UWLM.scroll_90_past_1_day) OVER(user_rolling_lead_window) AS scroll_90_past_1_day, + -- Maximum scroll_90 in the past 2 days + MAX(UWLM.scroll_90_past_2_day) OVER(user_rolling_lead_window) AS scroll_90_past_2_day, + -- Maximum scroll_90 in the past 3 days + MAX(UWLM.scroll_90_past_3_day) OVER(user_rolling_lead_window) AS scroll_90_past_3_day, + -- Maximum scroll_90 in the past 4 days + MAX(UWLM.scroll_90_past_4_day) OVER(user_rolling_lead_window) AS scroll_90_past_4_day, + -- Maximum scroll_90 in the past 5 days + MAX(UWLM.scroll_90_past_5_day) OVER(user_rolling_lead_window) AS scroll_90_past_5_day, + -- Maximum view_search_results in the past 1 day + MAX(UWLM.view_search_results_past_1_day) OVER(user_rolling_lead_window) AS view_search_results_past_1_day, + -- Maximum view_search_results in the past 2 days + MAX(UWLM.view_search_results_past_2_day) OVER(user_rolling_lead_window) AS view_search_results_past_2_day, + -- Maximum view_search_results in the past 3 days + MAX(UWLM.view_search_results_past_3_day) OVER(user_rolling_lead_window) AS view_search_results_past_3_day, + -- Maximum view_search_results in the past 4 days + MAX(UWLM.view_search_results_past_4_day) OVER(user_rolling_lead_window) AS view_search_results_past_4_day, + -- Maximum view_search_results in the past 5 days + MAX(UWLM.view_search_results_past_5_day) OVER(user_rolling_lead_window) AS view_search_results_past_5_day, + -- Maximum file_download in the past 1 day + MAX(UWLM.file_download_past_1_day) OVER(user_rolling_lead_window) AS file_download_past_1_day, + -- Maximum file_download in the past 2 days + MAX(UWLM.file_download_past_2_day) OVER(user_rolling_lead_window) AS file_download_past_2_day, + -- Maximum file_download in the past 3 days + MAX(UWLM.file_download_past_3_day) OVER(user_rolling_lead_window) AS file_download_past_3_day, + -- Maximum file_download in the past 4 days + MAX(UWLM.file_download_past_4_day) OVER(user_rolling_lead_window) AS file_download_past_4_day, + -- Maximum file_download in the past 5 days + MAX(UWLM.file_download_past_5_day) OVER(user_rolling_lead_window) AS file_download_past_5_day, + -- Maximum recipe_add_to_list in the past 1 day + MAX(UWLM.recipe_add_to_list_past_1_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_1_day, + -- Maximum recipe_add_to_list in the past 2 days + MAX(UWLM.recipe_add_to_list_past_2_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_2_day, + -- Maximum recipe_add_to_list in the past 3 days + MAX(UWLM.recipe_add_to_list_past_3_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_3_day, + -- Maximum recipe_add_to_list in the past 4 days + 
MAX(UWLM.recipe_add_to_list_past_4_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_4_day, + -- Maximum recipe_add_to_list in the past 5 days + MAX(UWLM.recipe_add_to_list_past_5_day) OVER(user_rolling_lead_window) AS recipe_add_to_list_past_5_day, + -- Maximum recipe_print in the past 1 day + MAX(UWLM.recipe_print_past_1_day) OVER(user_rolling_lead_window) AS recipe_print_past_1_day, + -- Maximum recipe_print in the past 2 days + MAX(UWLM.recipe_print_past_2_day) OVER(user_rolling_lead_window) AS recipe_print_past_2_day, + -- Maximum recipe_print in the past 3 days + MAX(UWLM.recipe_print_past_3_day) OVER(user_rolling_lead_window) AS recipe_print_past_3_day, + -- Maximum recipe_print in the past 4 days + MAX(UWLM.recipe_print_past_4_day) OVER(user_rolling_lead_window) AS recipe_print_past_4_day, + -- Maximum recipe_print in the past 5 days + MAX(UWLM.recipe_print_past_5_day) OVER(user_rolling_lead_window) AS recipe_print_past_5_day, + -- Maximum sign_up_past in the past 1 day + MAX(UWLM.sign_up_past_1_day) OVER(user_rolling_lead_window) AS sign_up_past_1_day, + -- Maximum sign_up_past in the past 2 days + MAX(UWLM.sign_up_past_2_day) OVER(user_rolling_lead_window) AS sign_up_past_2_day, + -- Maximum sign_up_past in the past 3 days + MAX(UWLM.sign_up_past_3_day) OVER(user_rolling_lead_window) AS sign_up_past_3_day, + -- Maximum sign_up_past in the past 4 days + MAX(UWLM.sign_up_past_4_day) OVER(user_rolling_lead_window) AS sign_up_past_4_day, + -- Maximum sign_up_past in the past 5 days + MAX(UWLM.sign_up_past_5_day) OVER(user_rolling_lead_window) AS sign_up_past_5_day, + -- Maximum recipe_favorite in the past 1 day + MAX(UWLM.recipe_favorite_past_1_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_1_day, + -- Maximum recipe_favorite in the past 2 days + MAX(UWLM.recipe_favorite_past_2_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_2_day, + -- Maximum recipe_favorite in the past 3 days + MAX(UWLM.recipe_favorite_past_3_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_3_day, + -- Maximum recipe_favorite in the past 4 days + MAX(UWLM.recipe_favorite_past_4_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_4_day, + -- Maximum recipe_favorite in the past 5 days + MAX(UWLM.recipe_favorite_past_5_day) OVER(user_rolling_lead_window) AS recipe_favorite_past_5_day, + -- Maximum recipe_add_to_menu in the past 1 day + MAX(UWLM.recipe_add_to_menu_past_1_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_1_day, + -- Maximum recipe_add_to_menu in the past 2 days + MAX(UWLM.recipe_add_to_menu_past_2_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_2_day, + -- Maximum recipe_add_to_menu in the past 3 days + MAX(UWLM.recipe_add_to_menu_past_3_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_3_day, + -- Maximum recipe_add_to_menu in the past 4 days + MAX(UWLM.recipe_add_to_menu_past_4_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_4_day, + -- Maximum recipe_add_to_menu in the past 5 days + MAX(UWLM.recipe_add_to_menu_past_5_day) OVER(user_rolling_lead_window) AS recipe_add_to_menu_past_5_day, +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.user_rolling_window_lead_metrics` UWLM +WHERE + -- In the future consider `feature_date BETWEEN start_date AND end_date`, to process multiple days. 
Modify Partition BY + UWLM.feature_date BETWEEN start_date AND end_date +WINDOW + user_rolling_lead_window AS (PARTITION BY UWLM.user_pseudo_id, UWLM.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating user labels over the dates intervals. +CREATE OR REPLACE TEMP TABLE training_preparation_label as ( + SELECT DISTINCT + LABEL.user_pseudo_id, -- The unique identifier for the user. + LABEL.feature_date, -- The date for which the features are extracted. + MAX(LABEL.login_day_1) OVER(lead_score_propensity_label_window) AS login_day_1, -- Whether the user made a login on day 1. +FROM + `{{feature_store_project_id}}.{{feature_store_dataset}}.lead_score_propensity_label` LABEL +WHERE + -- Define the training subset interval + LABEL.feature_date BETWEEN start_date AND end_date +WINDOW + lead_score_propensity_label_window AS (PARTITION BY LABEL.user_pseudo_id, LABEL.feature_date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +); + +-- This is a temp table consolidating all features and labels over the dates intervals. +CREATE OR REPLACE TEMP TABLE training_preparation as ( + SELECT DISTINCT + UD.user_pseudo_id, + UD.user_id, + UD.feature_date, + COALESCE(UD.user_ltv_revenue, 0.0) AS user_ltv_revenue, + UD.device_category, + UD.device_mobile_brand_name, + UD.device_mobile_model_name, + UD.device_os, + UD.device_language, + UD.device_web_browser, + UD.geo_sub_continent, + UD.geo_country, + UD.geo_region, + UD.geo_city, + UD.geo_metro, + UD.last_traffic_source_medium, + UD.last_traffic_source_name, + UD.last_traffic_source_source, + UD.first_traffic_source_medium, + UD.first_traffic_source_name, + UD.first_traffic_source_source, + UD.has_signed_in_with_user_id, + UWLM.scroll_50_past_1_day, + UWLM.scroll_50_past_2_day, + UWLM.scroll_50_past_3_day, + UWLM.scroll_50_past_4_day, + UWLM.scroll_50_past_5_day, + UWLM.scroll_90_past_1_day, + UWLM.scroll_90_past_2_day, + UWLM.scroll_90_past_3_day, + UWLM.scroll_90_past_4_day, + UWLM.scroll_90_past_5_day, + UWLM.view_search_results_past_1_day, + UWLM.view_search_results_past_2_day, + UWLM.view_search_results_past_3_day, + UWLM.view_search_results_past_4_day, + UWLM.view_search_results_past_5_day, + UWLM.file_download_past_1_day, + UWLM.file_download_past_2_day, + UWLM.file_download_past_3_day, + UWLM.file_download_past_4_day, + UWLM.file_download_past_5_day, + UWLM.recipe_add_to_list_past_1_day, + UWLM.recipe_add_to_list_past_2_day, + UWLM.recipe_add_to_list_past_3_day, + UWLM.recipe_add_to_list_past_4_day, + UWLM.recipe_add_to_list_past_5_day, + UWLM.recipe_print_past_1_day, + UWLM.recipe_print_past_2_day, + UWLM.recipe_print_past_3_day, + UWLM.recipe_print_past_4_day, + UWLM.recipe_print_past_5_day, + UWLM.sign_up_past_1_day, + UWLM.sign_up_past_2_day, + UWLM.sign_up_past_3_day, + UWLM.sign_up_past_4_day, + UWLM.sign_up_past_5_day, + UWLM.recipe_favorite_past_1_day, + UWLM.recipe_favorite_past_2_day, + UWLM.recipe_favorite_past_3_day, + UWLM.recipe_favorite_past_4_day, + UWLM.recipe_favorite_past_5_day, + UWLM.recipe_add_to_menu_past_1_day, + UWLM.recipe_add_to_menu_past_2_day, + UWLM.recipe_add_to_menu_past_3_day, + UWLM.recipe_add_to_menu_past_4_day, + UWLM.recipe_add_to_menu_past_5_day, + LABEL.login_day_1 +FROM + training_preparation_ud UD +INNER JOIN + training_preparation_uwlm UWLM +ON + UWLM.user_pseudo_id = UD.user_pseudo_id + AND UWLM.feature_date = UD.feature_date +INNER JOIN + training_preparation_label LABEL +ON + LABEL.user_pseudo_id = UD.user_pseudo_id + AND 
LABEL.feature_date = UD.feature_date +); + +-- This is a temp table split the rows in each different data_split (TRAIN, VALIDATE, TEST) split +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS( + SELECT DISTINCT + CASE + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN 0 AND train_split_end_number) THEN "TRAIN" + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN train_split_end_number AND validation_split_end_number) THEN "VALIDATE" + WHEN (ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN validation_split_end_number AND 9) THEN "TEST" + END as data_split, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + login_day_1 + FROM training_preparation); + +CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_training_full_dataset` AS +SELECT DISTINCT * FROM DataForTargetTable +WHERE data_split IS NOT NULL; + + +-- This is a table preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. 
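-- [Editor's aside, illustrative only] The TRAIN/VALIDATE/TEST assignment above hashes
-- user_pseudo_id with FARM_FINGERPRINT, so a given user always lands in the same split across
-- runs and never leaks between splits. Because the CASE arms are evaluated in order, a bucket
-- value that sits on a shared boundary (e.g. train_split_end_number itself) falls into the
-- earlier split. With, say, train_split_end_number = 5 and validation_split_end_number = 8,
-- the buckets work out to 0-5 / 6-8 / 9, i.e. roughly 60/30/10; the actual values are pipeline
-- parameters and are not shown in this diff. A self-contained sketch of the same idea:
SELECT
  user_pseudo_id,
  ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) AS bucket,
  CASE
    WHEN ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN 0 AND 5 THEN 'TRAIN'
    WHEN ABS(MOD(FARM_FINGERPRINT(user_pseudo_id), 10)) BETWEEN 5 AND 8 THEN 'VALIDATE'
    ELSE 'TEST'
  END AS data_split
FROM UNNEST(['user_a', 'user_b', 'user_c', 'user_d']) AS user_pseudo_id;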
+CREATE OR REPLACE TABLE `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` AS( + SELECT DISTINCT + CURRENT_TIMESTAMP() AS processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + LAST_VALUE(user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_id, + LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS user_ltv_revenue, + LAST_VALUE(device_category) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_category, + LAST_VALUE(device_mobile_brand_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_brand_name, + LAST_VALUE(device_mobile_model_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_mobile_model_name, + LAST_VALUE(device_os) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_os, + LAST_VALUE(device_language) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_language, + LAST_VALUE(device_web_browser) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS device_web_browser, + LAST_VALUE(geo_sub_continent) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_sub_continent, + LAST_VALUE(geo_country) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_country, + LAST_VALUE(geo_region) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_region, + LAST_VALUE(geo_city) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_city, + LAST_VALUE(geo_metro) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS geo_metro, + LAST_VALUE(last_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_medium, + LAST_VALUE(last_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_name, + LAST_VALUE(last_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS last_traffic_source_source, + LAST_VALUE(first_traffic_source_medium) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_medium, + LAST_VALUE(first_traffic_source_name) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_name, + LAST_VALUE(first_traffic_source_source) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS first_traffic_source_source, + LAST_VALUE(has_signed_in_with_user_id) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS has_signed_in_with_user_id, + LAST_VALUE(scroll_50_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_1_day, + LAST_VALUE(scroll_50_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_2_day, + LAST_VALUE(scroll_50_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_3_day, + LAST_VALUE(scroll_50_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_4_day, + LAST_VALUE(scroll_50_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_50_past_5_day, + LAST_VALUE(scroll_90_past_1_day) 
OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_1_day, + LAST_VALUE(scroll_90_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_2_day, + LAST_VALUE(scroll_90_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_3_day, + LAST_VALUE(scroll_90_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_4_day, + LAST_VALUE(scroll_90_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS scroll_90_past_5_day, + LAST_VALUE(view_search_results_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_1_day, + LAST_VALUE(view_search_results_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_2_day, + LAST_VALUE(view_search_results_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_3_day, + LAST_VALUE(view_search_results_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_4_day, + LAST_VALUE(view_search_results_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS view_search_results_past_5_day, + LAST_VALUE(file_download_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS file_download_past_1_day, + LAST_VALUE(file_download_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS file_download_past_2_day, + LAST_VALUE(file_download_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS file_download_past_3_day, + LAST_VALUE(file_download_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS file_download_past_4_day, + LAST_VALUE(file_download_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS file_download_past_5_day, + LAST_VALUE(recipe_add_to_list_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_1_day, + LAST_VALUE(recipe_add_to_list_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_2_day, + LAST_VALUE(recipe_add_to_list_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_3_day, + LAST_VALUE(recipe_add_to_list_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_4_day, + LAST_VALUE(recipe_add_to_list_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_list_past_5_day, + LAST_VALUE(recipe_print_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_1_day, + LAST_VALUE(recipe_print_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_2_day, + LAST_VALUE(recipe_print_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_3_day, + LAST_VALUE(recipe_print_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_print_past_4_day, + LAST_VALUE(recipe_print_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY 
feature_date DESC) AS recipe_print_past_5_day, + LAST_VALUE(sign_up_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_1_day, + LAST_VALUE(sign_up_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_2_day, + LAST_VALUE(sign_up_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_3_day, + LAST_VALUE(sign_up_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_4_day, + LAST_VALUE(sign_up_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS sign_up_past_5_day, + LAST_VALUE(recipe_favorite_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_1_day, + LAST_VALUE(recipe_favorite_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_2_day, + LAST_VALUE(recipe_favorite_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_3_day, + LAST_VALUE(recipe_favorite_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_4_day, + LAST_VALUE(recipe_favorite_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_favorite_past_5_day, + LAST_VALUE(recipe_add_to_menu_past_1_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_1_day, + LAST_VALUE(recipe_add_to_menu_past_2_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_2_day, + LAST_VALUE(recipe_add_to_menu_past_3_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_3_day, + LAST_VALUE(recipe_add_to_menu_past_4_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_4_day, + LAST_VALUE(recipe_add_to_menu_past_5_day) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS recipe_add_to_menu_past_5_day, + -- Calculate the will_login label. + -- Label for the lead score propensity model. It indicates whether a user logged in within the next day, based on their login history. + -- This label is then used to train a model that can predict the likelihood of future logins for other users. + LAST_VALUE(CASE WHEN (login_day_1) = 0 THEN 0 ELSE 1 END) OVER(PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) as will_login + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_full_dataset` +); + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day.
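The training_5_1 table above keeps one row per user and feature date and derives the binary will_login label from login_day_1: any login on the following day counts as a positive. A compact, self-contained sketch of that label derivation on hypothetical rows; because the LAST_VALUE partition and ordering keys are identical, the window function simply picks one value per user and day, and SELECT DISTINCT then collapses duplicates to a single row.

-- Standalone sketch: collapse duplicate (user, day) rows and binarize the next-day login count.
WITH full_dataset AS (
  SELECT 'user_a' AS user_pseudo_id, DATE '2024-01-01' AS feature_date, 2 AS login_day_1
  UNION ALL SELECT 'user_a', DATE '2024-01-01', 2  -- duplicate row carrying the same values
  UNION ALL SELECT 'user_b', DATE '2024-01-01', 0
)
SELECT DISTINCT
  user_pseudo_id,
  feature_date,
  LAST_VALUE(CASE WHEN login_day_1 = 0 THEN 0 ELSE 1 END)
    OVER (PARTITION BY user_pseudo_id, feature_date ORDER BY feature_date DESC) AS will_login
FROM full_dataset;
-- Result: one row per user and day; user_a -> will_login = 1, user_b -> will_login = 0.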
+CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead. View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + * EXCEPT(feature_date, row_order_peruser_persplit) +FROM ( +SELECT DISTINCT + processed_timestamp, + user_pseudo_id, + data_split, + feature_date, + -- Now, I want to skip rows per user, per split every 1 day. 
+ ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_login ORDER BY feature_date ASC) AS row_order_peruser_persplit, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login +FROM( +SELECT DISTINCT + processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + 
recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login, + -- Number of rows per user, per day, per split. Only one row per user, per day, per slip. + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, feature_date, data_split, will_login ORDER BY feature_date DESC) AS row_order_peruser_perday_persplit + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` +) +WHERE + row_order_peruser_perday_persplit = 1 +) +WHERE + --Skipping windows of 5 days, which is the past window size. + MOD(row_order_peruser_persplit-1, 5) = 0; + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. +-- This specifically filter rows which are most recent for each user. +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_last_window` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1_last_window", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + 
scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login +FROM( +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login, + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split, will_login ORDER BY feature_date DESC) AS user_row_order + --ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` +) +WHERE + user_row_order = 1; + + +-- This is a view preparing rows for lead score propensity modelling looking back 5 days and looking ahead 1 day. 
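The two views above rely on two row-sampling tricks: v_lead_score_propensity_training_5_1 keeps one row per user per day and then only every 5th remaining row per user (MOD(row_order_peruser_persplit - 1, 5) = 0), so consecutive 5-day lookback windows do not overlap, while v_lead_score_propensity_training_5_1_last_window keeps only each user's most recent row. A self-contained sketch of both patterns on hypothetical user/day pairs:

-- Standalone sketch of the two row-sampling patterns (hypothetical users and dates).
WITH user_days AS (
  SELECT user_pseudo_id, feature_date
  FROM UNNEST(['user_a', 'user_b']) AS user_pseudo_id,
       UNNEST(GENERATE_DATE_ARRAY(DATE '2024-01-01', DATE '2024-01-12')) AS feature_date
),
ordered AS (
  SELECT
    user_pseudo_id,
    feature_date,
    ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date ASC)  AS row_order_asc,
    ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) AS row_order_desc
  FROM user_days
)
SELECT
  user_pseudo_id,
  feature_date,
  MOD(row_order_asc - 1, 5) = 0 AS kept_for_training,  -- every 5th day per user: non-overlapping 5-day windows
  row_order_desc = 1            AS is_last_window      -- only the most recent day per user
FROM ordered
ORDER BY user_pseudo_id, feature_date;

In BigQuery the same filters can also be expressed with QUALIFY instead of wrapping subqueries, e.g. QUALIFY ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY feature_date DESC) = 1.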
+-- This is to be used in case recently no logins are registered, and you don't have a way to train the classification model. +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_rare_logins` +(processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_lead_score_propensity_training_5_1_rare_logins", + description="View Lead Score Propensity Training dataset using 5 days back to predict 1 day ahead.", + labels=[("org_unit", "development")] +) AS +SELECT DISTINCT + processed_timestamp, + data_split, + user_pseudo_id, + user_id, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_language, + device_web_browser, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + 
recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day, + will_login + FROM + (SELECT DISTINCT + * + FROM `{{project_id}}.{{dataset}}.v_lead_score_propensity_training_5_1_last_window` + ) + UNION ALL + ( + SELECT DISTINCT + * EXCEPT(user_row_order, feature_date) + FROM( + SELECT DISTINCT + *, + ROW_NUMBER() OVER (PARTITION BY user_pseudo_id, data_split ORDER BY feature_date DESC) AS user_row_order + FROM `{{project_id}}.{{dataset}}.lead_score_propensity_training_5_1` + WHERE will_login = 1 + ) + WHERE + user_row_order = 1 + LIMIT 100 + ) +; \ No newline at end of file diff --git a/sql/procedure/user_rolling_window_lead_metrics.sqlx b/sql/procedure/user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..5e15edce --- /dev/null +++ b/sql/procedure/user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,520 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
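The procedure below computes, for every user, how many times each lead-related event (scroll_50, scroll_90, view_search_results, file_download, recipe_add_to_list, recipe_print, sign_up, recipe_favorite, recipe_add_to_menu) occurred on each of the past 1 to 5 days relative to input_date. A minimal, self-contained sketch of the per-day counting pattern on hypothetical events follows; note that the procedure's MAX(COUNT(...)) OVER(PARTITION BY user_pseudo_id) wrapper is effectively equivalent to a plain grouped COUNT here, because the results are already grouped by user_pseudo_id.

-- Standalone sketch of the rolling per-day event counts (hypothetical events for one event_name).
DECLARE input_date DATE DEFAULT DATE '2024-01-10';
WITH events AS (
  SELECT 'user_a' AS user_pseudo_id, DATE '2024-01-09' AS event_date, TIMESTAMP '2024-01-09 10:00:00' AS event_timestamp
  UNION ALL SELECT 'user_a', DATE '2024-01-09', TIMESTAMP '2024-01-09 11:00:00'
  UNION ALL SELECT 'user_a', DATE '2024-01-07', TIMESTAMP '2024-01-07 09:30:00'
  UNION ALL SELECT 'user_b', DATE '2024-01-06', TIMESTAMP '2024-01-06 18:00:00'
)
SELECT
  user_pseudo_id,
  -- One bucket per lookback day: DATE_DIFF = 1 is "yesterday", = 2 is "two days ago", and so on.
  COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END) AS events_past_1_day,
  COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END) AS events_past_3_day,
  COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END) AS events_past_4_day
FROM events
GROUP BY user_pseudo_id;
-- user_a -> 2 events yesterday and 1 event three days ago; user_b -> 1 event four days ago.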
+ +-- Setting procedure to lookback from the day before `input_date` until the day before `end_date` +-- Subtract one day from `input_date` +SET input_date = DATE_SUB(input_date, INTERVAL 1 DAY); +-- Subtract one day from `end_date` +SET end_date = DATE_SUB(end_date, INTERVAL 1 DAY); + +-- Past User metrics: 1-day scroll_50 events per user, 2-5-day scroll_50 events per user +-- Create a temporary table `rolling_scroll_50_past_days` to store the rolling scroll_50 events count for each user +CREATE OR REPLACE TEMP TABLE rolling_scroll_50_past_days AS ( +SELECT + -- User's unique identifier + user_pseudo_id, + -- Calculate the number of scroll_50 made in the past 1 day + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_50_past_1_day, + -- Calculate the number of scroll_50 made in the past 2 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_50_past_2_day, + -- Calculate the number of scroll_50 made in the past 3 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_50_past_3_day, + -- Calculate the number of scroll_50 made in the past 4 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_50_past_4_day, + -- Calculate the number of scroll_50 made in the past 5 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_50_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for scroll_50 events +AND event_name='scroll_50' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day scroll_90 per user, 2-5-day scroll_90 per user +-- Create a temporary table `rolling_scroll_90_past_days` to store the rolling visit count for each user +CREATE OR REPLACE TEMP TABLE rolling_scroll_90_past_days AS ( +SELECT + -- User's unique identifier + user_pseudo_id, + -- Calculate the number of scroll_90 made in the past 1 day + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_90_past_1_day, + -- Calculate the number of scroll_90 made in the past 2 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_90_past_2_day, + -- Calculate the number of scroll_90 made in the past 3 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_90_past_3_day, + -- Calculate the number of scroll_90 made in the past 4 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_90_past_4_day, + -- Calculate the number of scroll_90 made in the past 5 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS scroll_90_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events 
within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for scroll_90 events +AND event_name='scroll_90' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day view_search_results per user, 2-5-day view_search_results per user +-- Create a temporary table `rolling_view_search_results_past_days` to store the rolling view item count for each user +CREATE OR REPLACE TEMP TABLE rolling_view_search_results_past_days AS ( +SELECT + -- User's unique identifier + user_pseudo_id, + -- Calculate the number of times the user view_search_results in the past 1 day + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS view_search_results_past_1_day, + -- Calculate the number of times the user view_search_results in the past 2 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS view_search_results_past_2_day, + -- Calculate the number of times the user view_search_results in the past 3 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS view_search_results_past_3_day, + -- Calculate the number of times the user view_search_results in the past 4 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS view_search_results_past_4_day, + -- Calculate the number of times the user view_search_results in the past 5 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS view_search_results_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for view_search_results events +AND event_name='view_search_results' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day file_download per user, 2-5-day file_download +-- Create a temporary table `rolling_file_download_past_days` to store the rolling file_download count for each user +CREATE OR REPLACE TEMP TABLE rolling_file_download_past_days AS ( +SELECT + -- User's unique identifier + user_pseudo_id, + -- Calculate the number of times the user file_download in the past 1 day + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS file_download_past_1_day, + -- Calculate the number of times the user file_download in the past 2 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS file_download_past_2_day, + -- Calculate the number of times the user file_download in the past 3 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS file_download_past_3_day, + -- Calculate the number of times the user file_download in the past 4 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS file_download_past_4_day, + -- Calculate the number of times 
the user file_download in the past 5 days + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS file_download_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for file_download events +AND event_name='file_download' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day recipe_add_to_list per user, 2-5-day recipe_add_to_list per user +-- 2-5-day recipe_add_to_list per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_add_to_list_past_days AS ( +SELECT + -- User pseudo ID, a unique identifier for the user + user_pseudo_id, + -- Number of recipe_add_to_list in the past 1st day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_list_past_1_day, + -- Number of recipe_add_to_list in the past 2nd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_list_past_2_day, + -- Number of recipe_add_to_list in the past 3rd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_list_past_3_day, + -- Number of recipe_add_to_list in the past 4th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_list_past_4_day, + -- Number of recipe_add_to_list in the past 5th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_list_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for recipe_add_to_list events +AND event_name='recipe_add_to_list' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day recipe_print per user, 2-5-day recipe_print per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_print_past_days AS ( +SELECT + -- User pseudo ID, a unique identifier for the user + user_pseudo_id, + -- Number of recipe_print in the past 1st day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_print_past_1_day, + -- Number of recipe_print in the past 2nd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_print_past_2_day, + -- Number of recipe_print in the past 3rd day, calculated using a window function partitioned by user_pseudo_id + 
MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_print_past_3_day, + -- Number of recipe_print in the past 4th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_print_past_4_day, + -- Number of recipe_print in the past 5th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_print_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for recipe_print events +AND event_name='recipe_print' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + +-- Past User metrics: 1-day sign_up per user, 2-5-day sign_up per user +-- 2-5-day sign_up per user +CREATE OR REPLACE TEMP TABLE rolling_sign_up_past_days AS ( +SELECT + -- User pseudo ID, a unique identifier for the user + user_pseudo_id, + -- Number of sign_up in the past 1st day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS sign_up_past_1_day, + -- Number of sign_up in the past 2nd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS sign_up_past_2_day, + -- Number of sign_up in the past 3rd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS sign_up_past_3_day, + -- Number of sign_up in the past 4th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS sign_up_past_4_day, + -- Number of sign_up in the past 5th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS sign_up_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for sign_up events +AND event_name='sign_up' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + + +-- Past User metrics: 1-day recipe_favorite per user, 2-5-day recipe_favorite per user +-- 2-5-day recipe_favorite per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_favorite_past_days AS ( +SELECT + -- User pseudo ID, a unique identifier for the user + user_pseudo_id, + -- Number of recipe_favorite in the past 1st day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_favorite_past_1_day, + -- Number of 
recipe_favorite in the past 2nd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_favorite_past_2_day, + -- Number of recipe_favorite in the past 3rd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_favorite_past_3_day, + -- Number of recipe_favorite in the past 4th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_favorite_past_4_day, + -- Number of recipe_favorite in the past 5th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_favorite_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for recipe_favorite events +AND event_name='recipe_favorite' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY user_pseudo_id +); + + +-- Past User metrics: 1-day recipe_add_to_menu per user, 2-5-day recipe_add_to_menu per user +-- 2-5-day recipe_add_to_menu per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_add_to_menu_past_days AS ( +SELECT + -- User pseudo ID, a unique identifier for the user + user_pseudo_id, + -- Number of recipe_add_to_menu in the past 1st day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 1 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_menu_past_1_day, + -- Number of recipe_add_to_menu in the past 2nd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 2 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_menu_past_2_day, + -- Number of recipe_add_to_menu in the past 3rd day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 3 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_menu_past_3_day, + -- Number of recipe_add_to_menu in the past 4th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 4 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_menu_past_4_day, + -- Number of recipe_add_to_menu in the past 5th day, calculated using a window function partitioned by user_pseudo_id + MAX(COUNT(DISTINCT CASE WHEN DATE_DIFF(input_date, event_date, DAY) = 5 THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id) AS recipe_add_to_menu_past_5_day +FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E +-- Filter events within the defined date range +WHERE event_date BETWEEN end_date AND input_date +-- Filter for recipe_add_to_menu events +AND event_name='recipe_add_to_menu' +-- Ensure valid session ID +AND ga_session_id IS NOT NULL +-- Group the results by user pseudo ID +GROUP BY 
user_pseudo_id +); + + +-- All users in the platform +CREATE OR REPLACE TEMP TABLE events_users_days as ( + SELECT DISTINCT + -- User pseudo ID + Users.user_pseudo_id, + -- distinct event date + Days.event_date as event_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + -- 'Days' is an alias for a temporary table containing distinct event dates + CROSS JOIN + (SELECT DISTINCT event_date FROM `{{mds_project_id}}.{{mds_dataset}}.event`) Days + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON Users.device_type_id = D.device_type_id + -- Exclude events without a valid session ID + WHERE Users.ga_session_id IS NOT NULL + -- Exclude events without a valid device operating system + AND D.device_os IS NOT NULL + -- Filter events within the defined date range + AND Days.event_date BETWEEN end_date AND input_date) +; + +-- Create a temporary table to store data for the target table +CREATE OR REPLACE TEMP TABLE DataForTargetTable AS +SELECT DISTINCT + -- Current timestamp + CURRENT_TIMESTAMP() AS processed_timestamp, + -- Feature date + input_date AS feature_date, + -- User pseudo ID + EUD.user_pseudo_id, + COALESCE(scroll_50_past_1_day,0) AS scroll_50_past_1_day, + COALESCE(scroll_50_past_2_day,0) AS scroll_50_past_2_day, + COALESCE(scroll_50_past_3_day,0) AS scroll_50_past_3_day, + COALESCE(scroll_50_past_4_day,0) AS scroll_50_past_4_day, + COALESCE(scroll_50_past_5_day,0) AS scroll_50_past_5_day, + COALESCE(scroll_90_past_1_day,0) AS scroll_90_past_1_day, + COALESCE(scroll_90_past_2_day,0) AS scroll_90_past_2_day, + COALESCE(scroll_90_past_3_day,0) AS scroll_90_past_3_day, + COALESCE(scroll_90_past_4_day,0) AS scroll_90_past_4_day, + COALESCE(scroll_90_past_5_day,0) AS scroll_90_past_5_day, + COALESCE(view_search_results_past_1_day,0) AS view_search_results_past_1_day, + COALESCE(view_search_results_past_2_day,0) AS view_search_results_past_2_day, + COALESCE(view_search_results_past_3_day,0) AS view_search_results_past_3_day, + COALESCE(view_search_results_past_4_day,0) AS view_search_results_past_4_day, + COALESCE(view_search_results_past_5_day,0) AS view_search_results_past_5_day, + COALESCE(file_download_past_1_day,0) AS file_download_past_1_day, + COALESCE(file_download_past_2_day,0) AS file_download_past_2_day, + COALESCE(file_download_past_3_day,0) AS file_download_past_3_day, + COALESCE(file_download_past_4_day,0) AS file_download_past_4_day, + COALESCE(file_download_past_5_day,0) AS file_download_past_5_day, + COALESCE(recipe_add_to_list_past_1_day,0) AS recipe_add_to_list_past_1_day, + COALESCE(recipe_add_to_list_past_2_day,0) AS recipe_add_to_list_past_2_day, + COALESCE(recipe_add_to_list_past_3_day,0) AS recipe_add_to_list_past_3_day, + COALESCE(recipe_add_to_list_past_4_day,0) AS recipe_add_to_list_past_4_day, + COALESCE(recipe_add_to_list_past_5_day,0) AS recipe_add_to_list_past_5_day, + COALESCE(recipe_print_past_1_day,0) AS recipe_print_past_1_day, + COALESCE(recipe_print_past_2_day,0) AS recipe_print_past_2_day, + COALESCE(recipe_print_past_3_day,0) AS recipe_print_past_3_day, + COALESCE(recipe_print_past_4_day,0) AS recipe_print_past_4_day, + COALESCE(recipe_print_past_5_day,0) AS recipe_print_past_5_day, + COALESCE(sign_up_past_1_day,0) AS sign_up_past_1_day, + COALESCE(sign_up_past_2_day,0) AS sign_up_past_2_day, + COALESCE(sign_up_past_3_day,0) AS sign_up_past_3_day, + COALESCE(sign_up_past_4_day,0) AS sign_up_past_4_day, + COALESCE(sign_up_past_5_day,0) AS sign_up_past_5_day, + COALESCE(recipe_favorite_past_1_day,0) AS recipe_favorite_past_1_day, + 
COALESCE(recipe_favorite_past_2_day,0) AS recipe_favorite_past_2_day, + COALESCE(recipe_favorite_past_3_day,0) AS recipe_favorite_past_3_day, + COALESCE(recipe_favorite_past_4_day,0) AS recipe_favorite_past_4_day, + COALESCE(recipe_favorite_past_5_day,0) AS recipe_favorite_past_5_day, + COALESCE(recipe_add_to_menu_past_1_day,0) AS recipe_add_to_menu_past_1_day, + COALESCE(recipe_add_to_menu_past_2_day,0) AS recipe_add_to_menu_past_2_day, + COALESCE(recipe_add_to_menu_past_3_day,0) AS recipe_add_to_menu_past_3_day, + COALESCE(recipe_add_to_menu_past_4_day,0) AS recipe_add_to_menu_past_4_day, + COALESCE(recipe_add_to_menu_past_5_day,0) AS recipe_add_to_menu_past_5_day + FROM events_users_days AS EUD + FULL OUTER JOIN rolling_scroll_50_past_days AS A + ON EUD.user_pseudo_id = A.user_pseudo_id + FULL OUTER JOIN rolling_scroll_90_past_days AS B + ON EUD.user_pseudo_id = B.user_pseudo_id + FULL OUTER JOIN rolling_view_search_results_past_days AS C + ON EUD.user_pseudo_id = C.user_pseudo_id + FULL OUTER JOIN rolling_file_download_past_days AS D + ON EUD.user_pseudo_id = D.user_pseudo_id + FULL OUTER JOIN rolling_recipe_add_to_list_past_days AS E + ON EUD.user_pseudo_id = E.user_pseudo_id + FULL OUTER JOIN rolling_recipe_print_past_days AS F + ON EUD.user_pseudo_id = F.user_pseudo_id + FULL OUTER JOIN rolling_sign_up_past_days AS G + ON EUD.user_pseudo_id = G.user_pseudo_id + FULL OUTER JOIN rolling_recipe_favorite_past_days AS H + ON EUD.user_pseudo_id = H.user_pseudo_id + FULL OUTER JOIN rolling_recipe_add_to_menu_past_days AS I + ON EUD.user_pseudo_id = I.user_pseudo_id + -- Exclude rows without a valid user pseudo ID + WHERE EUD.user_pseudo_id IS NOT NULL + ; + +-- Merge data into the target table +MERGE `{{project_id}}.{{dataset}}.{{insert_table}}` I +USING DataForTargetTable T +ON I.feature_date = T.feature_date + AND I.user_pseudo_id = T.user_pseudo_id +WHEN MATCHED THEN + UPDATE SET + -- Update the processed timestamp and rolling window features + I.processed_timestamp = T.processed_timestamp, + I.scroll_50_past_1_day = T.scroll_50_past_1_day, + I.scroll_50_past_2_day = T.scroll_50_past_2_day, + I.scroll_50_past_3_day = T.scroll_50_past_3_day, + I.scroll_50_past_4_day = T.scroll_50_past_4_day, + I.scroll_50_past_5_day = T.scroll_50_past_5_day, + I.scroll_90_past_1_day = T.scroll_90_past_1_day, + I.scroll_90_past_2_day = T.scroll_90_past_2_day, + I.scroll_90_past_3_day = T.scroll_90_past_3_day, + I.scroll_90_past_4_day = T.scroll_90_past_4_day, + I.scroll_90_past_5_day = T.scroll_90_past_5_day, + I.view_search_results_past_1_day = T.view_search_results_past_1_day, + I.view_search_results_past_2_day = T.view_search_results_past_2_day, + I.view_search_results_past_3_day = T.view_search_results_past_3_day, + I.view_search_results_past_4_day = T.view_search_results_past_4_day, + I.view_search_results_past_5_day = T.view_search_results_past_5_day, + I.file_download_past_1_day = T.file_download_past_1_day, + I.file_download_past_2_day = T.file_download_past_2_day, + I.file_download_past_3_day = T.file_download_past_3_day, + I.file_download_past_4_day = T.file_download_past_4_day, + I.file_download_past_5_day = T.file_download_past_5_day, + I.recipe_add_to_list_past_1_day = T.recipe_add_to_list_past_1_day, + I.recipe_add_to_list_past_2_day = T.recipe_add_to_list_past_2_day, + I.recipe_add_to_list_past_3_day = T.recipe_add_to_list_past_3_day, + I.recipe_add_to_list_past_4_day = T.recipe_add_to_list_past_4_day, + I.recipe_add_to_list_past_5_day = T.recipe_add_to_list_past_5_day, + 
I.recipe_print_past_1_day = T.recipe_print_past_1_day, + I.recipe_print_past_2_day = T.recipe_print_past_2_day, + I.recipe_print_past_3_day = T.recipe_print_past_3_day, + I.recipe_print_past_4_day = T.recipe_print_past_4_day, + I.recipe_print_past_5_day = T.recipe_print_past_5_day, + I.sign_up_past_1_day = T.sign_up_past_1_day, + I.sign_up_past_2_day = T.sign_up_past_2_day, + I.sign_up_past_3_day = T.sign_up_past_3_day, + I.sign_up_past_4_day = T.sign_up_past_4_day, + I.sign_up_past_5_day = T.sign_up_past_5_day, + I.recipe_favorite_past_1_day = T.recipe_favorite_past_1_day, + I.recipe_favorite_past_2_day = T.recipe_favorite_past_2_day, + I.recipe_favorite_past_3_day = T.recipe_favorite_past_3_day, + I.recipe_favorite_past_4_day = T.recipe_favorite_past_4_day, + I.recipe_favorite_past_5_day = T.recipe_favorite_past_5_day, + I.recipe_add_to_menu_past_1_day = T.recipe_add_to_menu_past_1_day, + I.recipe_add_to_menu_past_2_day = T.recipe_add_to_menu_past_2_day, + I.recipe_add_to_menu_past_3_day = T.recipe_add_to_menu_past_3_day, + I.recipe_add_to_menu_past_4_day = T.recipe_add_to_menu_past_4_day, + I.recipe_add_to_menu_past_5_day = T.recipe_add_to_menu_past_5_day +WHEN NOT MATCHED THEN + INSERT + (processed_timestamp, + feature_date, + user_pseudo_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day) + VALUES + (T.processed_timestamp, + T.feature_date, + T.user_pseudo_id, + T.scroll_50_past_1_day, + T.scroll_50_past_2_day, + T.scroll_50_past_3_day, + T.scroll_50_past_4_day, + T.scroll_50_past_5_day, + T.scroll_90_past_1_day, + T.scroll_90_past_2_day, + T.scroll_90_past_3_day, + T.scroll_90_past_4_day, + T.scroll_90_past_5_day, + T.view_search_results_past_1_day, + T.view_search_results_past_2_day, + T.view_search_results_past_3_day, + T.view_search_results_past_4_day, + T.view_search_results_past_5_day, + T.file_download_past_1_day, + T.file_download_past_2_day, + T.file_download_past_3_day, + T.file_download_past_4_day, + T.file_download_past_5_day, + T.recipe_add_to_list_past_1_day, + T.recipe_add_to_list_past_2_day, + T.recipe_add_to_list_past_3_day, + T.recipe_add_to_list_past_4_day, + T.recipe_add_to_list_past_5_day, + T.recipe_print_past_1_day, + T.recipe_print_past_2_day, + T.recipe_print_past_3_day, + T.recipe_print_past_4_day, + T.recipe_print_past_5_day, + T.sign_up_past_1_day, + 
T.sign_up_past_2_day, + T.sign_up_past_3_day, + T.sign_up_past_4_day, + T.sign_up_past_5_day, + T.recipe_favorite_past_1_day, + T.recipe_favorite_past_2_day, + T.recipe_favorite_past_3_day, + T.recipe_favorite_past_4_day, + T.recipe_favorite_past_5_day, + T.recipe_add_to_menu_past_1_day, + T.recipe_add_to_menu_past_2_day, + T.recipe_add_to_menu_past_3_day, + T.recipe_add_to_menu_past_4_day, + T.recipe_add_to_menu_past_5_day) +; + +-- Set a variable to track the number of rows added +SET rows_added = (SELECT COUNT(DISTINCT user_pseudo_id) FROM `{{project_id}}.{{dataset}}.{{insert_table}}`); diff --git a/sql/query/create_gemini_model.sqlx b/sql/query/create_gemini_model.sqlx index 84612d8f..4e365c4a 100644 --- a/sql/query/create_gemini_model.sqlx +++ b/sql/query/create_gemini_model.sqlx @@ -18,6 +18,6 @@ -- Your supervised tuning computations also occur in the europe-west4 region, because that's where TPU resources are located. -- Create a {{endpoint_name}} model using a remote connection to {{region}}.{{connection_name}} -CREATE OR REPLACE MODEL `{{project_id}}.{{dataset}}.{{model_name}}` +CREATE MODEL IF NOT EXISTS `{{project_id}}.{{dataset}}.{{model_name}}` REMOTE WITH CONNECTION `{{project_id}}.{{region}}.{{connection_name}}` OPTIONS (ENDPOINT = '{{endpoint_name}}'); \ No newline at end of file diff --git a/sql/query/invoke_backfill_lead_score_propensity_label.sqlx b/sql/query/invoke_backfill_lead_score_propensity_label.sqlx new file mode 100644 index 00000000..84ef7fc6 --- /dev/null +++ b/sql/query/invoke_backfill_lead_score_propensity_label.sqlx @@ -0,0 +1,116 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
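The backfill below builds the training label for every user and candidate feature date: it scaffolds the (user, date) combinations, counts each user's logins that fall exactly one day after the feature date, and binarizes the count with LEAST(COALESCE(..., 0), 1). A self-contained sketch of that derivation on hypothetical rows; it assumes logins are identified by event_name = 'login' and counted by event_timestamp, as in the rolling-window lead queries elsewhere in this change.

-- Standalone sketch of the 1-day-ahead login label (hypothetical events and feature dates).
WITH events AS (
  SELECT 'user_a' AS user_pseudo_id, 'login' AS event_name, DATE '2024-01-02' AS event_date, TIMESTAMP '2024-01-02 08:00:00' AS event_timestamp
  UNION ALL SELECT 'user_b', 'login', DATE '2024-01-05', TIMESTAMP '2024-01-05 12:00:00'
),
feature_dates AS (
  SELECT DATE '2024-01-01' AS feature_date
  UNION ALL SELECT DATE '2024-01-04'
),
future_logins AS (
  SELECT
    E.user_pseudo_id,
    F.feature_date,
    -- Count logins that happen exactly one day after the feature date.
    COUNT(DISTINCT CASE WHEN DATE_DIFF(E.event_date, F.feature_date, DAY) = 1 THEN E.event_timestamp END) AS login_day_1
  FROM events AS E
  CROSS JOIN feature_dates AS F
  WHERE LOWER(E.event_name) = 'login'
  GROUP BY E.user_pseudo_id, F.feature_date
)
SELECT
  user_pseudo_id,
  feature_date,
  -- Binarize: any number of next-day logins becomes 1; users with none default to 0.
  LEAST(COALESCE(login_day_1, 0), 1) AS login_day_1
FROM future_logins
ORDER BY user_pseudo_id, feature_date;
-- user_a is labeled 1 for 2024-01-01 and 0 for 2024-01-04; user_b is the reverse.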
+ +-- Declares a variable to store the maximum date for analysis +DECLARE max_date DATE; +-- Declares a variable to store the minimum date for analysis +DECLARE min_date DATE; +-- Sets the max_date variable to the latest event_date minus a specified number of days ({{interval_max_date}}) from the 'event' table +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +-- Sets the min_date variable to the earliest event_date plus a specified number of days ({{interval_min_date}}) from the 'event' table +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block acts as a safeguard to ensure that the min_date and max_date used for further analysis are always within the bounds of the actual data available in the table. +-- It prevents situations where calculations might mistakenly consider dates beyond the real data range, which could lead to errors or misleading results. +IF max_date > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- Creates a temporary table called dates_interval to store distinct event dates and their corresponding end dates +CREATE OR REPLACE TEMP TABLE dates_interval as ( + SELECT DISTINCT + -- Selects the distinct event_date and assigns it to the column input_date + event_date as input_date, + -- Calculates the end date by adding a specified number of days ({{interval_end_date}}) to the input_date + DATE_ADD(event_date, INTERVAL {{interval_end_date}} DAY) as end_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + -- Filters the events to include only those within the defined date range (between min_date and max_date) + WHERE event_date BETWEEN min_date AND max_date + ORDER BY input_date DESC +); + +-- All users in the platform +-- Creates a temporary table called all_users_possible_logins to store user login data +CREATE OR REPLACE TEMP TABLE all_users_possible_logins as ( + SELECT DISTINCT + -- Selects the user_pseudo_id from the 'event' table and assigns it to the column user_pseudo_id + Users.user_pseudo_id, + -- Selects the event_date from the date array generated using GENERATE_DATE_ARRAY and assigns it to the column feature_date + DI.event_date as feature_date, + -- Creates a series of columns (login_day_1) and initializes them with NULL values + -- These columns will be populated later with login data for specific days + NULL as login_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + -- Performs a cross join with a subquery that generates a date array using GENERATE_DATE_ARRAY + -- The date array includes dates from min_date 
to max_date with a 1-day interval + CROSS JOIN (SELECT event_date FROM UNNEST(GENERATE_DATE_ARRAY(min_date, max_date, INTERVAL 1 DAY)) as event_date) as DI + -- Filters the data to include events where event_name is 'login' + WHERE LOWER(Users.event_name) IN ('login') + AND Users.ga_session_id IS NOT NULL + ); + +-- Creates a temporary table called future_logins_per_user to store user login data in the future +-- Future User metrics: 1-day future logins per user +CREATE OR REPLACE TEMP TABLE future_logins_per_user AS ( + SELECT + -- Selects user_pseudo_id from the event table and assigns it to column user_pseudo_id + user_pseudo_id, + -- Selects input_date from the dates_interval table and assigns it to column feature_date + input_date as feature_date, + -- This calculation is performed over a window partitioned by user_pseudo_id and input_date + -- Counts the distinct login events that occur exactly 1 day after input_date + MAX(COUNT(DISTINCT CASE DATE_DIFF(event_date, input_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS login_day_1 + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON E.device_type_id = D.device_type_id + CROSS JOIN dates_interval as DI + -- Filters events to be within the date range defined by input_date and end_date from dates_interval + WHERE E.event_date BETWEEN DI.input_date AND DI.end_date + AND LOWER(E.event_name) IN ('login') + AND E.ga_session_id IS NOT NULL + AND D.device_os IS NOT NULL + -- Groups the result by user_pseudo_id and feature_date + GROUP BY user_pseudo_id, feature_date +); + +-- Inserts data into the target table +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + login_day_1 +) +SELECT DISTINCT + -- Selects the current timestamp and assigns it to the column processed_timestamp + CURRENT_TIMESTAMP() AS processed_timestamp, + -- Selects the feature_date from the all_users_possible_logins table and assigns it to the column feature_date + A.feature_date, + -- Selects the user_pseudo_id from the all_users_possible_logins table and assigns it to the column user_pseudo_id + A.user_pseudo_id, + -- Uses the LEAST function to get the minimum value between the coalesced value of login_day_1 and 1 + -- COALESCE is used to handle null values, replacing them with 0 + -- The result is capped at 1, turning the login count into a binary label + LEAST(COALESCE(B.login_day_1, 0), 1) AS login_day_1 +FROM all_users_possible_logins AS A +-- Performs a left join with the future_logins_per_user table (aliased as B) using user_pseudo_id and feature_date +LEFT JOIN future_logins_per_user AS B +ON B.user_pseudo_id = A.user_pseudo_id AND B.feature_date = A.feature_date +; \ No newline at end of file diff --git a/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx b/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..d98fab78 --- /dev/null +++ b/sql/query/invoke_backfill_user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,431 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This SQL code defines a series of temporary tables to calculate and store user engagement metrics based on +-- rolling window aggregations. These tables are then used to populate a target table with daily user engagement features. + +DECLARE max_date DATE; +DECLARE min_date DATE; +-- Sets max_date to the latest event_date from the event table, minus an offset specified by the interval_max_date +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +-- Sets min_date to the earliest event_date from the event table, plus an offset specified by the interval_min_date +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the max event_date and set max_date for the min event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block acts as a safeguard to ensure that the min_date and max_date used for further analysis are always within the bounds of the actual data available in the table. +-- It prevents situations where calculations might mistakenly consider dates beyond the real data range, which could lead to errors or misleading results. +IF max_date > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This section determines the date range for analysis and creates a temporary table dates_interval with distinct date intervals. +CREATE OR REPLACE TEMP TABLE dates_interval as ( + SELECT DISTINCT + -- Select each distinct event_date as 'input_date', representing the current date in the analysis + event_date as input_date, + -- Calculate the 'end_date' by subtracting a specified interval from the 'input_date' + DATE_SUB(event_date, INTERVAL {{interval_end_date}} DAY) as end_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + WHERE event_date BETWEEN min_date AND max_date + ORDER BY input_date DESC +); + +-- Run these windows aggregations every day. For each date in training and inference date ranges. 
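The temp tables that follow all rely on the same rolling-window counting idiom: COUNT(DISTINCT CASE ...) buckets events by how many days before the feature date they occurred, and MAX(...) OVER(PARTITION BY user_pseudo_id, input_date) carries that grouped value through so each user/date pair keeps one row. A minimal, self-contained sketch of the idiom on made-up data (not part of the change set):

-- Toy illustration of the past-N-day counting pattern used below (assumed sample data).
WITH sample_events AS (
  SELECT 'u1' AS user_pseudo_id, DATE '2024-01-09' AS event_date, 1001 AS event_timestamp UNION ALL
  SELECT 'u1', DATE '2024-01-09', 1002 UNION ALL
  SELECT 'u1', DATE '2024-01-08', 1003
),
sample_dates_interval AS (
  SELECT DATE '2024-01-10' AS input_date, DATE '2024-01-05' AS end_date
)
SELECT
  user_pseudo_id,
  input_date AS feature_date,
  -- Distinct events that happened exactly 1 day before the feature date
  MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS events_past_1_day,
  -- Distinct events that happened exactly 2 days before the feature date
  MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS events_past_2_day
FROM sample_events AS E
CROSS JOIN sample_dates_interval AS DI
WHERE E.event_date BETWEEN DI.end_date AND DI.input_date
GROUP BY user_pseudo_id, feature_date;
-- Returns one row: u1, 2024-01-10, events_past_1_day = 2, events_past_2_day = 1.

The CASE ... WHEN TRUE THEN event_timestamp form emits a timestamp only for events at the targeted day offset, so the distinct count is effectively a per-day event count.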
+-- All users metrics: 1–5-day scroll_50 users +CREATE OR REPLACE TEMP TABLE rolling_scroll_50_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has scroll_50 in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_50_past_1_day, + -- Number of times the user has scroll_50 in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_50_past_2_day, + -- Number of times the user has scroll_50 in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_50_past_3_day, + -- Number of times the user has scroll_50 in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_50_past_4_day, + -- Number of times the user has scroll_50 in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_50_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Filter for scroll_50 events + AND event_name='scroll_50' + -- Ensure valid session ID + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day scroll_90 per user, 2-5-day scroll_90 per user +CREATE OR REPLACE TEMP TABLE rolling_scroll_90_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of scroll_90 made by the user in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_90_past_1_day, + -- Number of scroll_90 made by the user in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_90_past_2_day, + -- Number of scroll_90 made by the user in the past 3th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_90_past_3_day, + -- Number of scroll_90 made by the user in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_90_past_4_day, + -- Number of scroll_90 made by the user in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS scroll_90_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'scroll_90' + AND event_name='scroll_90' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and 
feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day view_search_results per user, 2-5-day visits per user +CREATE OR REPLACE TEMP TABLE rolling_view_search_results_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Calculate the number of times the user view_search_results in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS view_search_results_past_1_day, + -- Calculate the number of times the user view_search_results in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS view_search_results_past_2_day, + -- Calculate the number of times the user view_search_results in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS view_search_results_past_3_day, + -- Calculate the number of times the user view_search_results in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS view_search_results_past_4_day, + -- Calculate the number of times the user view_search_results in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS view_search_results_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Filter for view_search_results events + AND event_name='view_search_results' + -- Ensure valid session ID + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day view_item per user, 2-5-day view_item per user +CREATE OR REPLACE TEMP TABLE rolling_file_download_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has file_download in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS file_download_past_1_day, + -- Number of times the user has file_download in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS file_download_past_2_day, + -- Number of times the user has file_download in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS file_download_past_3_day, + -- Number of times the user has file_download in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS file_download_past_4_day, + -- Number of times the user has file_download in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS file_download_past_5_day + FROM 
`{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'file_download' + AND event_name='file_download' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +## Past User metrics: 1-day recipe_add_to_list per user, 2-5-day recipe_add_to_list per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_add_to_list_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has added items to cart in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_list_past_1_day, + -- Number of times the user has added items to cart in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_list_past_2_day, + -- Number of times the user has added items to cart in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_list_past_3_day, + -- Number of times the user has added items to cart in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_list_past_4_day, + -- Number of times the user has added items to cart in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_list_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'recipe_add_to_list' + AND event_name='recipe_add_to_list' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day recipe_print per user, 2-5-day recipe_print per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_print_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has recipe_print in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_print_past_1_day, + -- Number of times the user has recipe_print in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_print_past_2_day, + -- Number of times the user has recipe_print in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_print_past_3_day, + -- Number of times the user has recipe_print in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS 
recipe_print_past_4_day, + -- Number of times the user has recipe_print in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_print_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'recipe_print' + AND event_name='recipe_print' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day sign_up per user, 2-5-day sign_up per user +CREATE OR REPLACE TEMP TABLE rolling_sign_up_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has sign_up in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS sign_up_past_1_day, + -- Number of times the user has sign_up in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS sign_up_past_2_day, + -- Number of times the user has sign_up in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS sign_up_past_3_day, + -- Number of times the user has sign_up in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS sign_up_past_4_day, + -- Number of times the user has sign_up in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS sign_up_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'sign_up' + AND event_name='sign_up' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day recipe_favorite per user, 2-5-day recipe_favorite per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_favorite_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has recipe_favorite in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_favorite_past_1_day, + -- Number of times the user has recipe_favorite in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_favorite_past_2_day, + -- Number of times the user has recipe_favorite in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_favorite_past_3_day, + -- Number of times the user has recipe_favorite in the past 4th day + MAX(COUNT(DISTINCT 
CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_favorite_past_4_day, + -- Number of times the user has recipe_favorite in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_favorite_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'recipe_favorite' + AND event_name='recipe_favorite' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- Past User metrics: 1-day recipe_add_to_menu per user, 2-5-day recipe_add_to_menu per user +CREATE OR REPLACE TEMP TABLE rolling_recipe_add_to_menu_past_days AS ( + SELECT + user_pseudo_id, + input_date as feature_date, + -- Number of times the user has recipe_add_to_menu in the past 1st day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 1 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_menu_past_1_day, + -- Number of times the user has recipe_add_to_menu in the past 2nd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 2 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_menu_past_2_day, + -- Number of times the user has recipe_add_to_menu in the past 3rd day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 3 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_menu_past_3_day, + -- Number of times the user has recipe_add_to_menu in the past 4th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 4 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_menu_past_4_day, + -- Number of times the user has recipe_add_to_menu in the past 5th day + MAX(COUNT(DISTINCT CASE DATE_DIFF(input_date, event_date, DAY) = 5 WHEN TRUE THEN event_timestamp END)) OVER(PARTITION BY user_pseudo_id, input_date) AS recipe_add_to_menu_past_5_day + FROM `{{mds_project_id}}.{{mds_dataset}}.event` as E + CROSS JOIN dates_interval as DI + -- Filter events to be within the defined date range + WHERE E.event_date BETWEEN DI.end_date AND DI.input_date + -- Consider only events with event name 'recipe_add_to_menu' + AND event_name='recipe_add_to_menu' + AND ga_session_id IS NOT NULL + -- Group the results by user pseudo ID and feature date + GROUP BY user_pseudo_id, feature_date +); + +-- All users in the platform +-- This code creates a temporary table that contains a distinct list of user pseudo IDs +-- and their corresponding feature dates, filtering for events with valid session IDs, +-- device operating systems, and falling within the specified date range. 
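Each rolling_*_past_days table above is grouped by user_pseudo_id and feature_date, so it should contain at most one row per user and date before the tables are combined into the final insert further down. A quick ad-hoc query along these lines (a sketch, not part of the change set) can confirm that assumption for any one of them once the temp tables exist:

-- Expect zero rows: duplicates per (user_pseudo_id, feature_date) would multiply
-- rows when the rolling tables are joined together below.
SELECT
  user_pseudo_id,
  feature_date,
  COUNT(*) AS rows_per_key
FROM rolling_scroll_50_past_days
GROUP BY user_pseudo_id, feature_date
HAVING COUNT(*) > 1;

The same check can be pointed at the other rolling_*_past_days tables by changing the FROM clause.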
+CREATE OR REPLACE TEMP TABLE events_users as ( + SELECT DISTINCT + Users.user_pseudo_id, + DI.input_date as feature_date + FROM `{{mds_project_id}}.{{mds_dataset}}.event` Users + INNER JOIN `{{mds_project_id}}.{{mds_dataset}}.device` as D + ON Users.device_type_id = D.device_type_id + CROSS JOIN dates_interval as DI + WHERE Users.ga_session_id IS NOT NULL + AND Users.event_date BETWEEN DI.end_date AND DI.input_date + AND D.device_os IS NOT NULL +); + +-- This code block inserts data into a table, combining information from the events_users +-- table and several temporary tables containing rolling window features. The resulting data +-- represents user-level features for each user and date, capturing their past activity within +-- different time windows. +INSERT INTO `{{project_id}}.{{dataset}}.{{insert_table}}` ( + processed_timestamp, + feature_date, + user_pseudo_id, + scroll_50_past_1_day, + scroll_50_past_2_day, + scroll_50_past_3_day, + scroll_50_past_4_day, + scroll_50_past_5_day, + scroll_90_past_1_day, + scroll_90_past_2_day, + scroll_90_past_3_day, + scroll_90_past_4_day, + scroll_90_past_5_day, + view_search_results_past_1_day, + view_search_results_past_2_day, + view_search_results_past_3_day, + view_search_results_past_4_day, + view_search_results_past_5_day, + file_download_past_1_day, + file_download_past_2_day, + file_download_past_3_day, + file_download_past_4_day, + file_download_past_5_day, + recipe_add_to_list_past_1_day, + recipe_add_to_list_past_2_day, + recipe_add_to_list_past_3_day, + recipe_add_to_list_past_4_day, + recipe_add_to_list_past_5_day, + recipe_print_past_1_day, + recipe_print_past_2_day, + recipe_print_past_3_day, + recipe_print_past_4_day, + recipe_print_past_5_day, + sign_up_past_1_day, + sign_up_past_2_day, + sign_up_past_3_day, + sign_up_past_4_day, + sign_up_past_5_day, + recipe_favorite_past_1_day, + recipe_favorite_past_2_day, + recipe_favorite_past_3_day, + recipe_favorite_past_4_day, + recipe_favorite_past_5_day, + recipe_add_to_menu_past_1_day, + recipe_add_to_menu_past_2_day, + recipe_add_to_menu_past_3_day, + recipe_add_to_menu_past_4_day, + recipe_add_to_menu_past_5_day +) + SELECT DISTINCT + -- This selects the current timestamp and assigns it to the column processed_timestamp. 
+ CURRENT_TIMESTAMP() AS processed_timestamp, + EUD.feature_date, + EUD.user_pseudo_id, + COALESCE(scroll_50_past_1_day,0) AS scroll_50_past_1_day, + COALESCE(scroll_50_past_2_day,0) AS scroll_50_past_2_day, + COALESCE(scroll_50_past_3_day,0) AS scroll_50_past_3_day, + COALESCE(scroll_50_past_4_day,0) AS scroll_50_past_4_day, + COALESCE(scroll_50_past_5_day,0) AS scroll_50_past_5_day, + COALESCE(scroll_90_past_1_day,0) AS scroll_90_past_1_day, + COALESCE(scroll_90_past_2_day,0) AS scroll_90_past_2_day, + COALESCE(scroll_90_past_3_day,0) AS scroll_90_past_3_day, + COALESCE(scroll_90_past_4_day,0) AS scroll_90_past_4_day, + COALESCE(scroll_90_past_5_day,0) AS scroll_90_past_5_day, + COALESCE(view_search_results_past_1_day,0) AS view_search_results_past_1_day, + COALESCE(view_search_results_past_2_day,0) AS view_search_results_past_2_day, + COALESCE(view_search_results_past_3_day,0) AS view_search_results_past_3_day, + COALESCE(view_search_results_past_4_day,0) AS view_search_results_past_4_day, + COALESCE(view_search_results_past_5_day,0) AS view_search_results_past_5_day, + COALESCE(file_download_past_1_day,0) AS file_download_past_1_day, + COALESCE(file_download_past_2_day,0) AS file_download_past_2_day, + COALESCE(file_download_past_3_day,0) AS file_download_past_3_day, + COALESCE(file_download_past_4_day,0) AS file_download_past_4_day, + COALESCE(file_download_past_5_day,0) AS file_download_past_5_day, + COALESCE(recipe_add_to_list_past_1_day,0) AS recipe_add_to_list_past_1_day, + COALESCE(recipe_add_to_list_past_2_day,0) AS recipe_add_to_list_past_2_day, + COALESCE(recipe_add_to_list_past_3_day,0) AS recipe_add_to_list_past_3_day, + COALESCE(recipe_add_to_list_past_4_day,0) AS recipe_add_to_list_past_4_day, + COALESCE(recipe_add_to_list_past_5_day,0) AS recipe_add_to_list_past_5_day, + COALESCE(recipe_print_past_1_day,0) AS recipe_print_past_1_day, + COALESCE(recipe_print_past_2_day,0) AS recipe_print_past_2_day, + COALESCE(recipe_print_past_3_day,0) AS recipe_print_past_3_day, + COALESCE(recipe_print_past_4_day,0) AS recipe_print_past_4_day, + COALESCE(recipe_print_past_5_day,0) AS recipe_print_past_5_day, + COALESCE(sign_up_past_1_day,0) AS sign_up_past_1_day, + COALESCE(sign_up_past_2_day,0) AS sign_up_past_2_day, + COALESCE(sign_up_past_3_day,0) AS sign_up_past_3_day, + COALESCE(sign_up_past_4_day,0) AS sign_up_past_4_day, + COALESCE(sign_up_past_5_day,0) AS sign_up_past_5_day, + COALESCE(recipe_favorite_past_1_day,0) AS recipe_favorite_past_1_day, + COALESCE(recipe_favorite_past_2_day,0) AS recipe_favorite_past_2_day, + COALESCE(recipe_favorite_past_3_day,0) AS recipe_favorite_past_3_day, + COALESCE(recipe_favorite_past_4_day,0) AS recipe_favorite_past_4_day, + COALESCE(recipe_favorite_past_5_day,0) AS recipe_favorite_past_5_day, + COALESCE(recipe_add_to_menu_past_1_day,0) AS recipe_add_to_menu_past_1_day, + COALESCE(recipe_add_to_menu_past_2_day,0) AS recipe_add_to_menu_past_2_day, + COALESCE(recipe_add_to_menu_past_3_day,0) AS recipe_add_to_menu_past_3_day, + COALESCE(recipe_add_to_menu_past_4_day,0) AS recipe_add_to_menu_past_4_day, + COALESCE(recipe_add_to_menu_past_5_day,0) AS recipe_add_to_menu_past_5_day + FROM events_users AS EUD + -- This performs a full outer join, which combines all rows from both tables, + -- including those that don't have matching values. 
+ FULL OUTER JOIN rolling_scroll_50_past_days AS A + ON EUD.user_pseudo_id = A.user_pseudo_id + FULL OUTER JOIN rolling_scroll_90_past_days AS B + ON EUD.user_pseudo_id = B.user_pseudo_id + FULL OUTER JOIN rolling_view_search_results_past_days AS C + ON EUD.user_pseudo_id = C.user_pseudo_id + FULL OUTER JOIN rolling_file_download_past_days AS D + ON EUD.user_pseudo_id = D.user_pseudo_id + FULL OUTER JOIN rolling_recipe_add_to_list_past_days AS E + ON EUD.user_pseudo_id = E.user_pseudo_id + FULL OUTER JOIN rolling_recipe_print_past_days AS F + ON EUD.user_pseudo_id = F.user_pseudo_id + FULL OUTER JOIN rolling_sign_up_past_days AS G + ON EUD.user_pseudo_id = G.user_pseudo_id + FULL OUTER JOIN rolling_recipe_favorite_past_days AS H + ON EUD.user_pseudo_id = H.user_pseudo_id + FULL OUTER JOIN rolling_recipe_add_to_menu_past_days AS I + ON EUD.user_pseudo_id = I.user_pseudo_id + -- This filters the results to include only rows where the user_pseudo_id is not null. + WHERE EUD.user_pseudo_id IS NOT NULL + ; \ No newline at end of file diff --git a/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx b/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx new file mode 100644 index 00000000..54e937d7 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_inference_preparation.sqlx @@ -0,0 +1,23 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script determines the current date and then passes it as an argument to a +-- stored procedure in your BigQuery project. This pattern is commonly used when +-- you want a stored procedure to perform operations or calculations that are +-- relevant to the current date, such as data processing, analysis, or reporting tasks. + +DECLARE inference_date DATE DEFAULT NULL; +SET inference_date = CURRENT_DATE(); + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(inference_date); diff --git a/sql/query/invoke_lead_score_propensity_label.sqlx b/sql/query/invoke_lead_score_propensity_label.sqlx new file mode 100644 index 00000000..f4288278 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_label.sqlx @@ -0,0 +1,39 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script sets up a date range, calls a stored procedure with this range and a variable to +-- store a result, and then returns the result of the stored procedure. This pattern is common +-- for orchestrating data processing tasks within BigQuery using stored procedures. 
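As a generic illustration of the orchestration pattern described above, a caller declares the date range and a variable for the procedure's output, invokes the procedure, and can then read the variable back. Everything in this sketch is hypothetical (procedure name, dataset, and body are stand-ins; the real procedures are created elsewhere in the repository and are referenced here only through templated placeholders):

DECLARE start_dt DATE DEFAULT DATE_SUB(CURRENT_DATE(), INTERVAL 15 DAY);
DECLARE end_dt DATE DEFAULT CURRENT_DATE();
DECLARE rows_added INT64 DEFAULT NULL;

-- Hypothetical stand-in with the same calling convention (IN, IN, OUT) as the
-- procedures these invoke scripts call.
CREATE OR REPLACE PROCEDURE `my_project.my_dataset.my_feature_procedure`(
  input_date DATE, end_date DATE, OUT users_added INT64)
BEGIN
  SET users_added = 0;  -- a real procedure would build feature rows and report a count
END;

CALL `my_project.my_dataset.my_feature_procedure`(start_dt, end_dt, rows_added);

-- Optionally surface the OUT value, as invoke_user_rolling_window_lead_metrics.sqlx
-- does with SELECT users_added.
SELECT rows_added;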
+ +DECLARE input_date DATE; +DECLARE end_date DATE; +DECLARE users_added INT64 DEFAULT NULL; + +SET end_date= CURRENT_DATE(); +SET input_date= (SELECT DATE_SUB(end_date, INTERVAL {{interval_input_date}} DAY)); + +-- This code block ensures that the end_date used in subsequent operations is not later than one day after the latest available data in +-- the specified events table. This prevents potential attempts to process data for a date range that extends beyond the actual data availability. +IF (SELECT DATE_SUB(end_date, INTERVAL 1 DAY)) > (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET end_date = (SELECT DATE_ADD(MAX(event_date), INTERVAL 1 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- This code block ensures that the input_date used in subsequent operations is not before the earliest available data in the +-- specified events table. This prevents potential errors or unexpected behavior that might occur when trying to process data +-- for a date range that precedes the actual data availability. +IF input_date < (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) THEN + SET input_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL 1 DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added); \ No newline at end of file diff --git a/sql/query/invoke_lead_score_propensity_training_preparation.sqlx b/sql/query/invoke_lead_score_propensity_training_preparation.sqlx new file mode 100644 index 00000000..cf93a2d8 --- /dev/null +++ b/sql/query/invoke_lead_score_propensity_training_preparation.sqlx @@ -0,0 +1,73 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script intelligently determines the optimal date range for training a lead +-- score propensity model by considering user-defined parameters and the availability of login +-- events within the dataset. It ensures that the training data includes login events if +-- they exist within the specified bounds. + +-- Intended start and end dates for training data +-- Initializing Training Dates +DECLARE train_start_date DATE DEFAULT NULL; +DECLARE train_end_date DATE DEFAULT NULL; + +-- Control data splitting for training and validation (likely used in a subsequent process). +DECLARE train_split_end_number INT64 DEFAULT NULL; +DECLARE validation_split_end_number INT64 DEFAULT NULL; + +-- Will store the count of distinct users who made a login within a given period. +DECLARE logged_users INT64 DEFAULT NULL; + +-- Used to store the maximum and minimum event dates from the source data.
+DECLARE max_date DATE; +DECLARE min_date DATE; + +-- Determining Maximum and Minimum Dates +SET max_date = (SELECT DATE_SUB(MAX(event_date), INTERVAL {{interval_max_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +SET min_date = (SELECT DATE_ADD(MIN(event_date), INTERVAL {{interval_min_date}} DAY) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + +-- If min_date > maximum event_date OR max_date < minimum event_date, then set min_date for the min event_date and set max_date for the max event_date +IF min_date >= (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR max_date <= (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`) OR min_date >= max_date THEN + SET min_date = (SELECT MIN(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); + SET max_date = (SELECT MAX(event_date) FROM `{{mds_project_id}}.{{mds_dataset}}.event`); +END IF; + +-- Setting Split Numbers +-- Sets the train_split_end_number to a user-defined value. This value likely determines the proportion of data used for training. +SET train_split_end_number = {{train_split_end_number}}; -- If you want 60% for training use number 5. If you want 80% use number 7. +-- Sets the validation_split_end_number to a user-defined value, controlling the proportion of data used for validation. +SET validation_split_end_number = {{validation_split_end_number}}; + +-- This crucial step counts distinct users who have an event named 'login' within the initially set training date range. +-- IF there are no logged_users in the time interval selected, then set "train_start_date" and "train_end_date" as "max_date" and "min_date". +SET logged_users = (SELECT COUNT(DISTINCT user_pseudo_id) + FROM `{{mds_project_id}}.{{mds_dataset}}.event` + WHERE event_name = 'login' AND + event_date BETWEEN min_date AND max_date + ); + +-- Setting Training Dates +-- If there are logged_users in the training set, then keep the calculated dates, or else set +-- the start and end dates to a fixed interval preventing `train_start_date` and `train_end_date` from being NULL. +IF logged_users > 0 THEN + SET train_start_date = min_date; + SET train_end_date = max_date; +ELSE + SET train_start_date = DATE_SUB(CURRENT_DATE(), INTERVAL 3 YEAR); + SET train_end_date = DATE_SUB(CURRENT_DATE(), INTERVAL 5 DAY); +END IF; + +-- Finally, the script calls a stored procedure, passing the adjusted training dates and split numbers as arguments. This stored procedure +-- handles the actual data preparation for the lead score propensity model. +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(train_start_date, train_end_date, train_split_end_number, validation_split_end_number); diff --git a/sql/query/invoke_user_rolling_window_lead_metrics.sqlx b/sql/query/invoke_user_rolling_window_lead_metrics.sqlx new file mode 100644 index 00000000..e469a2d7 --- /dev/null +++ b/sql/query/invoke_user_rolling_window_lead_metrics.sqlx @@ -0,0 +1,28 @@ +-- Copyright 2023 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+-- See the License for the specific language governing permissions and +-- limitations under the License. + +-- This script sets up a date range, calls a stored procedure with this range and a variable to +-- store a result, and then returns the result of the stored procedure. This pattern is common +-- for orchestrating data processing tasks within BigQuery using stored procedures. + +DECLARE input_date DATE; +DECLARE end_date DATE; +DECLARE users_added INT64 DEFAULT NULL; + +SET input_date= CURRENT_DATE(); +SET end_date= (SELECT DATE_SUB(input_date, INTERVAL {{interval_end_date}} DAY)); + +CALL `{{project_id}}.{{dataset}}.{{stored_procedure}}`(input_date, end_date, users_added); + +SELECT users_added; \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_inference_preparation.json b/sql/schema/table/lead_score_propensity_inference_preparation.json new file mode 100644 index 00000000..5fc9e6ec --- /dev/null +++ b/sql/schema/table/lead_score_propensity_inference_preparation.json @@ -0,0 +1,337 @@ +[ + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "user_id", + "type": "STRING", + "description": "The user identifier when the user is logged in" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "Date that serves as the basis for the calculation of the features" + }, + { + "name": "user_ltv_revenue", + "type": "FLOAT", + "description": "The current customer lifetime value revenue of the user" + }, + { + "name": "device_category", + "type": "STRING", + "description": "The device category the user last accessed" + }, + { + "name": "device_mobile_brand_name", + "type": "STRING", + "description": "The device mobile brand name the user last accessed" + }, + { + "name": "device_mobile_model_name", + "type": "STRING", + "description": "The device mobile model name the user last accessed" + }, + { + "name": "device_os", + "type": "STRING", + "description": "The device operating system the user last accessed" + }, + { + "name": "device_language", + "type": "STRING", + "description": "The device language the user last accessed" + }, + { + "name": "device_web_browser", + "type": "STRING", + "description": "The device web browser the user last accessed" + }, + { + "name": "geo_sub_continent", + "type": "STRING", + "description": "The geographic subcontinent the user last accessed from" + }, + { + "name": "geo_country", + "type": "STRING", + "description": "The geographic country the user last accessed from" + }, + { + "name": "geo_region", + "type": "STRING", + "description": "The geographic region the user last accessed from" + }, + { + "name": "geo_city", + "type": "STRING", + "description": "The geographic city the user last accessed from" + }, + { + "name": "geo_metro", + "type": "STRING", + "description": "The geographic metropolitan area the user last accessed from" + }, + { + "name": "last_traffic_source_medium", + "type": "STRING", + "description": "The last traffic source medium the user has been acquired" + }, + { + "name": "last_traffic_source_name", + "type": "STRING", + "description": "The last traffic source name the user has been acquired" + }, + { + "name": "last_traffic_source_source", + "type": "STRING", + "description": "The last traffic source source the user has been acquired" + }, + { + "name": "first_traffic_source_medium", + "type": "STRING", + "description": "The first traffic source medium the user has been acquired" + }, + { + "name": "first_traffic_source_name", + 
"type": "STRING", + "description": "The first traffic source name the user has been acquired" + }, + { + "name": "first_traffic_source_source", + "type": "STRING", + "description": "The first traffic source source the user has been acquired" + }, + { + "name": "has_signed_in_with_user_id", + "type": "BOOLEAN", + "description": "A boolean indicating whether the user has signed in with an user id" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 1 day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the 
past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, 
+ { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + } +] \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_label.json b/sql/schema/table/lead_score_propensity_label.json new file mode 100644 index 00000000..8b63bc6f --- /dev/null +++ b/sql/schema/table/lead_score_propensity_label.json @@ -0,0 +1,22 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "login_day_1", + "type": "INTEGER", + "description": "Predicted number of logins by the user in the next 1st day from the feature date" + } +] \ No newline at end of file diff --git a/sql/schema/table/lead_score_propensity_training_preparation.json b/sql/schema/table/lead_score_propensity_training_preparation.json new file mode 100644 index 00000000..f5647417 --- /dev/null +++ b/sql/schema/table/lead_score_propensity_training_preparation.json @@ -0,0 +1,352 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "data_split", + "type": "STRING", + "description": "The indication of whether the row should be used for TRAINING, VALIDATION or TESTING" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "user_id", + "type": "STRING", + "description": "The user identifier of when the user has logged in" + }, + { + "name": "user_ltv_revenue", + "type": "FLOAT", + "description": "The current user lifetime value" + }, + { + "name": "device_category", + "type": "STRING", + "description": "The device category of the user last used to access" + }, + { + "name": "device_mobile_brand_name", + "type": "STRING", + "description": "The device mobile brand name last used by the user" + }, + { + "name": "device_mobile_model_name", + "type": "STRING", + "description": "The device mobile model name last used by the user" + }, + { + "name": "device_os", + "type": "STRING", + "description": "The device operating system last used by the user" + }, + { + "name": "device_language", + "type": "STRING", + "description": "The device language last used by the user" + }, + { + "name": "device_web_browser", + "type": "STRING", + "description": "The device web browser last used by the user" + }, + { + "name": "geo_sub_continent", + "type": "STRING", + "description": "The geographic subcontinent from the user last access" + }, + { + "name": "geo_country", + "type": "STRING", + "description": "The geographic country from the user last access" + }, + { + "name": "geo_region", + "type": "STRING", + "description": "The geographic region from the user last access" + }, + { + "name": "geo_city", + "type": "STRING", + "description": "The geographic city from the user last access" + }, + { + "name": "geo_metro", + "type": "STRING", + "description": "The geographic metropolitan area from the user user last access" + }, + { + "name": "last_traffic_source_medium", + "type": "STRING", + "description": "The last traffic source medium from where the user was 
acquired" + }, + { + "name": "last_traffic_source_name", + "type": "STRING", + "description": "The last traffic source name from where the user was acquired" + }, + { + "name": "last_traffic_source_source", + "type": "STRING", + "description": "The last traffic source soure from where the user was acquired" + }, + { + "name": "first_traffic_source_medium", + "type": "STRING", + "description": "The first traffic source medium from where the user was acquired" + }, + { + "name": "first_traffic_source_name", + "type": "STRING", + "description": "The first traffic source name from where the user was acquired" + }, + { + "name": "first_traffic_source_source", + "type": "STRING", + "description": "The first traffic source source from where the user was acquired" + }, + { + "name": "has_signed_in_with_user_id", + "type": "BOOLEAN", + "description": "A boolean indicating whether the user has signed in with the user id" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 1 day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has been active in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled 90p pages in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": 
"INTEGER", + "description": "The number of times the user has visited in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has visited in the past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed items in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added items to cart in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The 
number of times the user has checked out in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 3rd day" + }, + { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 4th day" + }, + { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has checked out in the past 5th day" + }, + { + "name": "will_login", + "type": "INTEGER", + "description": "A boolean indicating whether the user will login in the next period" + } +] \ No newline at end of file diff --git a/sql/schema/table/purchase_propensity_inference_preparation.json b/sql/schema/table/purchase_propensity_inference_preparation.json index 0fe328b2..2f8b1256 100644 --- a/sql/schema/table/purchase_propensity_inference_preparation.json +++ b/sql/schema/table/purchase_propensity_inference_preparation.json @@ -109,161 +109,6 @@ "type": "BOOLEAN", "description": "A boolean indicating whether the user has signed in with an user id" }, - { - "name": "engagement_rate", - "type": "FLOAT", - "description": "The percentage of sessions that were engaged sessions. Engagement rate = engaged sessions / total sessions Engagement rate is the inverse of bounce rate" - }, - { - "name": "engaged_sessions_per_user", - "type": "INTEGER", - "description": "The number of engaged sessions per user" - }, - { - "name": "session_conversion_rate", - "type": "FLOAT", - "description": "The session conversion rate is calculated by dividing the number of sessions with a conversion event by the total number of sessions" - }, - { - "name": "bounces", - "type": "INTEGER", - "description": "The number of not engaged sessions" - }, - { - "name": "bounce_rate_per_user", - "type": "FLOAT", - "description": "The percentage of sessions that were not engaged sessions per user. Bounce rate = not engaged sessions / total sessions Bounce rate is the inverse of engagement rate" - }, - { - "name": "sessions_per_user", - "type": "INTEGER", - "description": "The number of sessions per user" - }, - { - "name": "avg_views_per_session", - "type": "FLOAT", - "description": "The average number of views per sessions" - }, - { - "name": "sum_engagement_time_seconds", - "type": "FLOAT", - "description": "The sum of time that your website was in focus in a user's browser or an app was in the foreground of a user's device in seconds per user" - }, - { - "name": "avg_engagement_time_seconds", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. 
Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "new_visits", - "type": "INTEGER", - "description": "The number of times your users opened your website for the first time" - }, - { - "name": "returning_visits", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "add_to_carts", - "type": "INTEGER", - "description": "The number of times users added items to their shopping carts" - }, - { - "name": "cart_to_view_rate", - "type": "FLOAT", - "description": "The number of times users added items to their shopping carts divided by the the number of mobile app screens or web pages your users saw. Repeated views of a single screen or page are counted" - }, - { - "name": "checkouts", - "type": "INTEGER", - "description": "The number of times users started the checkout process" - }, - { - "name": "ecommerce_purchases", - "type": "INTEGER", - "description": "The number of purchases on your website or app" - }, - { - "name": "ecommerce_quantity", - "type": "INTEGER", - "description": "The number of units for an ecommerce event" - }, - { - "name": "ecommerce_revenue", - "type": "FLOAT", - "description": "The sum of revenue from purchases made on your website or app, minus any refunds given. Purchase revenue = purchases + in-app purchases + subscriptions - refund" - }, - { - "name": "item_revenue", - "type": "FLOAT", - "description": "The total revenue from items only minus refunds, excluding tax and shipping" - }, - { - "name": "item_quantity", - "type": "INTEGER", - "description": "The number of units for a single item included in ecommerce events" - }, - { - "name": "item_view_events", - "type": "INTEGER", - "description": "The number of times an item was viewed" - }, - { - "name": "items_clicked_in_promotion", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a promotion" - }, - { - "name": "items_clicked_in_list", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a list of items" - }, - { - "name": "items_checked_out", - "type": "INTEGER", - "description": "The number of times the user has checked out" - }, - { - "name": "items_added_to_cart", - "type": "INTEGER", - "description": "The number of times the user has added items to cart" - }, - { - "name": "item_list_view_events", - "type": "INTEGER", - "description": "The number of times the user has viewed items in list" - }, - { - "name": "purchase_revenue", - "type": "FLOAT", - "description": "The total revenue from purchases, in-app purchases, subscriptions, and ad revenue. 
Total revenue = purchases + in-app purchases + subscriptions + ad revenue - refunds" - }, - { - "name": "purchase_to_view_rate", - "type": "FLOAT", - "description": "The number of purchases on your website or app divided by the number of mobile app screens or web pages your users saw" - }, - { - "name": "transactions_per_purchaser", - "type": "FLOAT", - "description": "The average number of purchases per buyer for the selected time frame" - }, - { - "name": "user_conversion_rate", - "type": "FLOAT", - "description": "The number of users who performed a conversion action divided by the total number of users" - }, - { - "name": "how_many_purchased_before", - "type": "INTEGER", - "description": "The number of times the user have purchased before" - }, - { - "name": "has_abandoned_cart", - "type": "BOOLEAN", - "description": "a boolean indicating whether the user has abandoned a cart in the past" - }, { "name": "active_users_past_1_day", "type": "INTEGER", diff --git a/sql/schema/table/purchase_propensity_training_preparation.json b/sql/schema/table/purchase_propensity_training_preparation.json index e5d284d5..f984f42e 100644 --- a/sql/schema/table/purchase_propensity_training_preparation.json +++ b/sql/schema/table/purchase_propensity_training_preparation.json @@ -119,161 +119,6 @@ "type": "BOOLEAN", "description": "A boolean indicating whether the user has signed in with the user id" }, - { - "name": "engagement_rate", - "type": "FLOAT", - "description": "The percentage of sessions that were engaged sessions. Engagement rate = engaged sessions / total sessions Engagement rate is the inverse of bounce rate" - }, - { - "name": "engaged_sessions_per_user", - "type": "INTEGER", - "description": "The number of engaged sessions per user" - }, - { - "name": "session_conversion_rate", - "type": "FLOAT", - "description": "The session conversion rate is calculated by dividing the number of sessions with a conversion event by the total number of sessions" - }, - { - "name": "bounces", - "type": "INTEGER", - "description": "The number of not engaged sessions" - }, - { - "name": "bounce_rate_per_user", - "type": "FLOAT", - "description": "The percentage of sessions that were not engaged sessions per user. Bounce rate = not engaged sessions / total sessions Bounce rate is the inverse of engagement rate" - }, - { - "name": "sessions_per_user", - "type": "INTEGER", - "description": "The number of sessions per user" - }, - { - "name": "avg_views_per_session", - "type": "FLOAT", - "description": "The average number of views per sessions" - }, - { - "name": "sum_engagement_time_seconds", - "type": "FLOAT", - "description": "The sum of time that your website was in focus in a user's browser or an app was in the foreground of a user's device in seconds per user" - }, - { - "name": "avg_engagement_time_seconds", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. 
Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "new_visits", - "type": "INTEGER", - "description": "The number of times your users opened your website for the first time" - }, - { - "name": "returning_visits", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "add_to_carts", - "type": "INTEGER", - "description": "The number of times users added items to their shopping carts" - }, - { - "name": "cart_to_view_rate", - "type": "FLOAT", - "description": "The number of times users added items to their shopping carts divided by the the number of mobile app screens or web pages your users saw. Repeated views of a single screen or page are counted" - }, - { - "name": "checkouts", - "type": "INTEGER", - "description": "The number of times users started the checkout process" - }, - { - "name": "ecommerce_purchases", - "type": "INTEGER", - "description": "The number of purchases on your website or app" - }, - { - "name": "ecommerce_quantity", - "type": "INTEGER", - "description": "The number of units for an ecommerce event" - }, - { - "name": "ecommerce_revenue", - "type": "FLOAT", - "description": "The sum of revenue from purchases made on your website or app, minus any refunds given. Purchase revenue = purchases + in-app purchases + subscriptions - refund" - }, - { - "name": "item_revenue", - "type": "FLOAT", - "description": "The total revenue from items only minus refunds, excluding tax and shipping" - }, - { - "name": "item_quantity", - "type": "INTEGER", - "description": "The number of units for a single item included in ecommerce events" - }, - { - "name": "item_view_events", - "type": "INTEGER", - "description": "The number of times an item was viewed" - }, - { - "name": "items_clicked_in_promotion", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a promotion" - }, - { - "name": "items_clicked_in_list", - "type": "INTEGER", - "description": "The number of items that the customer clicked in a list of items" - }, - { - "name": "items_checked_out", - "type": "INTEGER", - "description": "The number of times the user has checked out" - }, - { - "name": "items_added_to_cart", - "type": "INTEGER", - "description": "The number of times the user has added items to cart" - }, - { - "name": "item_list_view_events", - "type": "INTEGER", - "description": "The number of times the user has viewed items in list" - }, - { - "name": "purchase_revenue", - "type": "FLOAT", - "description": "The total revenue from purchases, in-app purchases, subscriptions, and ad revenue. 
Total revenue = purchases + in-app purchases + subscriptions + ad revenue - refunds" - }, - { - "name": "purchase_to_view_rate", - "type": "FLOAT", - "description": "The number of purchases on your website or app divided by the number of mobile app screens or web pages your users saw" - }, - { - "name": "transactions_per_purchaser", - "type": "FLOAT", - "description": "The average number of purchases per buyer for the selected time frame" - }, - { - "name": "user_conversion_rate", - "type": "FLOAT", - "description": "The number of users who performed a conversion action divided by the total number of users" - }, - { - "name": "how_many_purchased_before", - "type": "INTEGER", - "description": "The number of times the user have purchased before" - }, - { - "name": "has_abandoned_cart", - "type": "BOOLEAN", - "description": "a boolean indicating whether the user has abandoned a cart in the past" - }, { "name": "active_users_past_1_day", "type": "INTEGER", @@ -544,131 +389,6 @@ "type": "INTEGER", "description": "The number of times the user has checked out in the past 15 to 30 days" }, - { - "name": "purchasers_users", - "type": "INTEGER", - "description": "The number of distinct users who have purchases in the past" - }, - { - "name": "average_daily_purchasers", - "type": "FLOAT", - "description": "The average number of purchasers across all the days in the selected time frame" - }, - { - "name": "active_users", - "type": "INTEGER", - "description": "The number of distinct users who visited your website or application. An active user is any user who has an engaged session or when Analytics collects: the first_visit event or engagement_time_msec parameter from a website the first_open event or engagement_time_msec parameter from an Android app the first_open or user_engagement event from an iOS app" - }, - { - "name": "DAU", - "type": "FLOAT", - "description": "The number of users who engaged for the calendar day" - }, - { - "name": "MAU", - "type": "FLOAT", - "description": "The number of users who engaged in the last 30 days" - }, - { - "name": "WAU", - "type": "FLOAT", - "description": "The number of users who engaged in the last week" - }, - { - "name": "dau_per_mau", - "type": "FLOAT", - "description": "Daily Active Users (DAU) / Monthly Active Users (MAU) shows the percentage of users who engaged for the calendar day out of the users who engaged in the last 30 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "dau_per_wau", - "type": "FLOAT", - "description": "Daily Active Users (DAU) / Weekly Active Users (WAU) shows the percentage of users who engaged in the last 24 hours out of the users who engaged in the last 7 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "wau_per_mau", - "type": "FLOAT", - "description": "Weekly Active Users (DAU) / Monthly Active Users (MAU) shows the percentage of users who engaged in the last 7 days out of the users who engaged in the last 30 days. A higher ratio suggests good engagement and user retention" - }, - { - "name": "users_engagement_duration_seconds", - "type": "FLOAT", - "description": "The length of time that your app screen was in the foreground or your web page was in focus in seconds" - }, - { - "name": "average_engagement_time", - "type": "FLOAT", - "description": "The average time that your website was in focus in a user's browser or an app was in the foreground of a user's device. 
Average engagement time = total user engagement durations / number of active users" - }, - { - "name": "average_engagement_time_per_session", - "type": "FLOAT", - "description": "The average engagement time per session" - }, - { - "name": "average_sessions_per_user", - "type": "FLOAT", - "description": "The average number of sessions per user" - }, - { - "name": "ARPPU", - "type": "FLOAT", - "description": "Average revenue per paying user (ARPPU) is the total purchase revenue per active user who made a purchase" - }, - { - "name": "ARPU", - "type": "FLOAT", - "description": "Average revenue per active user (ARPU) is the total revenue generated on average from each active user, whether they made a purchase or not. ARPU = (Total ad revenue + purchase revenue + in-app purchase revenue + subscriptions) / Active users" - }, - { - "name": "average_daily_revenue", - "type": "FLOAT", - "description": "Average daily revenue The average total revenue for a day over the selected time frame" - }, - { - "name": "max_daily_revenue", - "type": "FLOAT", - "description": "The maximum total revenue for a day over the selected time frame" - }, - { - "name": "min_daily_revenue", - "type": "FLOAT", - "description": "The minimum total revenue for a day over the selected time frame" - }, - { - "name": "new_users", - "type": "INTEGER", - "description": "The number of new unique user IDs that logged the first_open or first_visit event. The metric allows you to measure the number of users who interacted with your site or launched your app for the first time" - }, - { - "name": "returning_users", - "type": "INTEGER", - "description": "The number of users who have initiated at least one previous session, regardless of whether or not the previous sessions were engaged sessions" - }, - { - "name": "first_time_purchasers", - "type": "INTEGER", - "description": "The number of users who made their first purchase in the selected time frame." - }, - { - "name": "first_time_purchaser_conversion", - "type": "FLOAT", - "description": "The percentage of active users who made their first purchase. 
This metric is returned as a fraction; for example, 0.092 means 9.2% of active users were first-time purchasers" - }, - { - "name": "first_time_purchasers_per_new_user", - "type": "FLOAT", - "description": "The average number of first-time purchasers per new user" - }, - { - "name": "avg_user_conversion_rate", - "type": "FLOAT", - "description": "The average number of converting user per total users" - }, - { - "name": "avg_session_conversion_rate", - "type": "FLOAT", - "description": "The average number of converting session per total sessions" - }, { "name": "will_purchase", "type": "INTEGER", diff --git a/sql/schema/table/user_rolling_window_lead_metrics.json b/sql/schema/table/user_rolling_window_lead_metrics.json new file mode 100644 index 00000000..e22d0ceb --- /dev/null +++ b/sql/schema/table/user_rolling_window_lead_metrics.json @@ -0,0 +1,242 @@ +[ + { + "name": "processed_timestamp", + "type": "TIMESTAMP", + "description": "Timestamp of when the data was processed" + }, + { + "name": "feature_date", + "type": "DATE", + "description": "The date serving as basis for the features calculation" + }, + { + "name": "user_pseudo_id", + "type": "STRING", + "description": "The user pseudo identifier" + }, + { + "name": "scroll_50_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 50% of a page in the past day" + }, + { + "name": "scroll_50_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 50% of a page in the past 2nd day" + }, + { + "name": "scroll_50_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 50% of a page in the past 3rd day" + }, + { + "name": "scroll_50_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 50% of a page in the past 4th day" + }, + { + "name": "scroll_50_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 50% of a page in the past 5th day" + }, + { + "name": "scroll_90_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 90% of a page in the past day" + }, + { + "name": "scroll_90_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 90% of a page in the past 2nd day" + }, + { + "name": "scroll_90_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 90% of a page in the past 3rd day" + }, + { + "name": "scroll_90_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 90% of a page in the past 4th day" + }, + { + "name": "scroll_90_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has scrolled through 90% of a page in the past 5th day" + }, + { + "name": "view_search_results_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has viewed search results in the past day" + }, + { + "name": "view_search_results_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has viewed search results in the past 2nd day" + }, + { + "name": "view_search_results_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has viewed search results in the past 3rd day" + }, + { + "name": "view_search_results_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has viewed search results in the past 4th day" + }, + { + "name": "view_search_results_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has viewed search results in the past 5th day" + }, + { + "name": "file_download_past_1_day", + "type": "INTEGER", + "description": "The number of
times the user has downloaded a file in the past day" + }, + { + "name": "file_download_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has downloaded a file in the past 2nd day" + }, + { + "name": "file_download_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has downloaded a file in the past 3rd day" + }, + { + "name": "file_download_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has downloaded a file in the past 4th day" + }, + { + "name": "file_download_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has downloaded a file in the past 5th day" + }, + { + "name": "recipe_add_to_list_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a list in the past day" + }, + { + "name": "recipe_add_to_list_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a list in the past 2nd day" + }, + { + "name": "recipe_add_to_list_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a list in the past 3rd day" + }, + { + "name": "recipe_add_to_list_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a list in the past 4th day" + }, + { + "name": "recipe_add_to_list_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a list in the past 5th day" + }, + { + "name": "recipe_print_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has printed recipes in the past day" + }, + { + "name": "recipe_print_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has printed recipes in the past 2nd day" + }, + { + "name": "recipe_print_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has printed recipes in the past 3rd day" + }, + { + "name": "recipe_print_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has printed recipes in the past 4th day" + }, + { + "name": "recipe_print_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has printed recipes in the past 5th day" + }, + { + "name": "sign_up_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has signed up in the past day" + }, + { + "name": "sign_up_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has signed up in the past 2nd day" + }, + { + "name": "sign_up_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has signed up in the past 3rd day" + }, + { + "name": "sign_up_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has signed up in the past 4th day" + }, + { + "name": "sign_up_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has signed up in the past 5th day" + }, + { + "name": "recipe_favorite_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to favorites in the past day" + }, + { + "name": "recipe_favorite_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to favorites in the past 2nd day" + }, + { + "name": "recipe_favorite_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to favorites in the past 3rd day" + }, + { + "name": "recipe_favorite_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to favorites in the past 4th day" +
}, + { + "name": "recipe_favorite_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to favorites in the past 5th day" + }, + { + "name": "recipe_add_to_menu_past_1_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a menu in the past day" + }, + { + "name": "recipe_add_to_menu_past_2_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a menu in the past 2nd day" + }, + { + "name": "recipe_add_to_menu_past_3_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a menu in the past 3rd day" + }, + { + "name": "recipe_add_to_menu_past_4_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a menu in the past 4th day" + }, + { + "name": "recipe_add_to_menu_past_5_day", + "type": "INTEGER", + "description": "The number of times the user has added recipes to a menu in the past 5th day" + } +] \ No newline at end of file diff --git a/templates/activation_query/lead_score_propensity_query_template.sqlx b/templates/activation_query/lead_score_propensity_query_template.sqlx new file mode 100644 index 00000000..5ad0b874 --- /dev/null +++ b/templates/activation_query/lead_score_propensity_query_template.sqlx @@ -0,0 +1,14 @@ +SELECT + a.prediction AS user_prop_l_s_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS user_prop_l_s_p_decile, + b.user_pseudo_id AS client_id, + b.user_id AS user_id, + b.ga_session_id AS event_param_session_id, + '100' AS event_param_engagement_time_msec, + CASE WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) END AS inference_date +FROM + `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_72_hours` b, + `{{source_table}}` a +WHERE + COALESCE(a.user_id, "") = COALESCE(b.user_id, "") + AND a.user_pseudo_id = b.user_pseudo_id diff --git a/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx b/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx new file mode 100644 index 00000000..9be0e0a9 --- /dev/null +++ b/templates/activation_query/lead_score_propensity_vbb_query_template.sqlx @@ -0,0 +1,35 @@ +WITH user_prediction_decile AS ( + SELECT + a.prediction AS l_s_p_prediction, + NTILE(10) OVER (ORDER BY a.prediction_prob DESC) AS l_s_p_decile, + b.user_pseudo_id AS client_id, + b.user_id AS user_id, + b.ga_session_id AS session_id, + CASE + WHEN EXTRACT(MICROSECOND FROM b.event_timestamp) = 1 THEN b.event_timestamp + ELSE TIMESTAMP_SUB(b.event_timestamp, INTERVAL 1 MICROSECOND) + END AS inference_date + FROM + `${mds_project_id}.marketing_ga4_v1_${mds_dataset_suffix}.latest_event_per_user_last_24_hours` b, + `{{source_table}}` a + WHERE + COALESCE(a.user_id, "") = COALESCE(b.user_id, "") + AND a.user_pseudo_id = b.user_pseudo_id) +SELECT + a.l_s_p_prediction AS user_prop_l_s_p_prediction, + a.l_s_p_decile AS user_prop_l_s_p_decile, + b.value AS event_param_value, + 'USD' AS event_param_currency, + a.client_id, + a.user_id, + a.session_id AS event_param_session_id, + a.inference_date +FROM + user_prediction_decile AS a +LEFT JOIN + `${activation_project_id}.${dataset}.vbb_activation_configuration` AS b +ON + a.l_s_p_decile = b.decile +WHERE + b.activation_type = 'lead-score-propensity' +AND b.value > 0 \ No newline at end of file diff --git a/templates/activation_type_configuration_template.tpl b/templates/activation_type_configuration_template.tpl index
d73206c7..913b70a2 100644 --- a/templates/activation_type_configuration_template.tpl +++ b/templates/activation_type_configuration_template.tpl @@ -46,5 +46,9 @@ "churn-propensity-15-7": { "activation_event_name": "maj_churn_propensity_15_7", "source_query_template": "${churn_propensity_query_template_gcs_path}" + }, + "lead-score-propensity-30-15": { + "activation_event_name": "maj_lead_score_propensity_30_15", + "source_query_template": "${lead_score_propensity_query_template_gcs_path}" } } diff --git a/templates/activation_user_import/lead_score_propensity_csv_export.sqlx b/templates/activation_user_import/lead_score_propensity_csv_export.sqlx new file mode 100644 index 00000000..376cea56 --- /dev/null +++ b/templates/activation_user_import/lead_score_propensity_csv_export.sqlx @@ -0,0 +1,27 @@ +DECLARE + select_query STRING; +SET + select_query = FORMAT(""" + CREATE TEMPORARY TABLE tmp_selection AS + SELECT + user_pseudo_id AS client_id, + '${ga4_stream_id}' AS stream_id, + prediction AS l_s_p_prediction, + NTILE(10) OVER (ORDER BY prediction_prob DESC) AS l_s_p_decile + FROM `%s` + """, prediction_table_name); +EXECUTE IMMEDIATE + select_query; +EXPORT DATA + OPTIONS ( uri = 'gs://${export_bucket}/csv-export/lead_score_propensity-*.csv', + format = 'CSV', + OVERWRITE = TRUE, + header = TRUE, + field_delimiter = ',' ) AS ( + SELECT + client_id, + stream_id, + l_s_p_prediction, + l_s_p_decile + FROM + tmp_selection ); diff --git a/templates/vbb_activation_configuration.jsonl b/templates/vbb_activation_configuration.jsonl index a0c64ad1..57b200e0 100644 --- a/templates/vbb_activation_configuration.jsonl +++ b/templates/vbb_activation_configuration.jsonl @@ -1,2 +1,3 @@ {"activation_type":"purchase-propensity","value_norm":150,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} -{"activation_type":"cltv","value_norm":500,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} \ No newline at end of file +{"activation_type":"cltv","value_norm":500,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} +{"activation_type":"lead-score-propensity","value_norm":150,"decile_multiplier":[{"decile":1,"multiplier":5.5},{"decile":2,"multiplier":3},{"decile":3,"multiplier":2},{"decile":4,"multiplier":1},{"decile":5,"multiplier":0},{"decile":6,"multiplier":0},{"decile":7,"multiplier":0},{"decile":8,"multiplier":0},{"decile":9,"multiplier":0},{"decile":10,"multiplier":0}]} \ No newline at end of file
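Note on the new row in templates/vbb_activation_configuration.jsonl: the lead-score-propensity entry reuses the purchase-propensity value_norm (150) and decile multipliers, and lead_score_propensity_vbb_query_template.sqlx only emits events for deciles whose configured value is greater than zero. The Python sketch below illustrates how per-decile event values could be derived from this file; it assumes the `value` column of the `vbb_activation_configuration` table is populated as value_norm multiplied by each decile's multiplier, which is an assumption about the loading step rather than something shown in this diff.

import json

# Hypothetical sketch: expand one row of vbb_activation_configuration.jsonl into
# per-decile event values. The value = value_norm * multiplier rule is an
# assumption about how the vbb_activation_configuration table is populated.
with open("templates/vbb_activation_configuration.jsonl") as config_file:
    for line in config_file:
        row = json.loads(line)
        if row["activation_type"] != "lead-score-propensity":
            continue
        for entry in row["decile_multiplier"]:
            value = row["value_norm"] * entry["multiplier"]
            # Only deciles with a positive value (deciles 1-4 in this file) would
            # pass the `b.value > 0` filter in the VBB query template.
            print(f"decile {entry['decile']}: value={value}")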