Skip to content

Commit bf19d4d

Browse files
chmstimoteoCarlos Timoteo
and
Carlos Timoteo
authored
Chmstimoteo patch2 (#47)
* fixing typos and making more robust logic to avoid duplicates * adding default tables schemas for tabular workflow compilation * tabular workflow dataset preparations * preventing high memory consumption with order by * removing duplicates and expiration time from views * substituting INSERTs for UPSERTs * fixing typo * fixing typos --------- Co-authored-by: Carlos Timoteo <[email protected]>
1 parent e2d16dc commit bf19d4d

25 files changed

+4709
-2596
lines changed

config/config.yaml.tftpl

+13-11
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ vertex_ai:
130130
query_parameters:
131131
- { name: "input_date", type: "DATE", value: None } # If value is not defined then assume current_date()
132132
#INT64
133-
timeout: 1800.0
133+
timeout: 3600.0
134134
pipeline_parameters_substitutions: # Substitutions are applied to the parameters before compilation
135135
customer_lifetime_value_label_procedure_name: "${project_id}.feature_store.invoke_customer_lifetime_value_label"
136136
purchase_propensity_label_procedure_name: "${project_id}.feature_store.invoke_purchase_propensity_label"
@@ -186,6 +186,7 @@ vertex_ai:
186186
# data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_15_7"
187187
# data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_15_15"
188188
data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_30_15"
189+
data_source_bigquery_table_schema: "sql/schema/table/purchase_propensity_training_preparation.json"
189190
dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com"
190191
timestamp_split_key: null
191192
stratified_split_key: null
@@ -329,6 +330,7 @@ vertex_ai:
329330
data_source_csv_filenames: null
330331
optimization_objective: minimize-mae # minimize-mae | minimize-rmse | minimize-rmsle
331332
data_source_bigquery_table_path: "bq://${project_id}.customer_lifetime_value.v_customer_lifetime_value_training_180_30"
333+
data_source_bigquery_table_schema: "sql/schema/table/customer_lifetime_value_training_preparation.json"
332334
dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com"
333335
timestamp_split_key: null
334336
stratified_split_key: null
@@ -392,7 +394,7 @@ bigquery:
392394
is_case_insensitive: TRUE
393395
description: "Feature Store dataset for Marketing behavioural modeling"
394396
friendly_name: "Feature Store"
395-
max_time_travel_hours: 48
397+
max_time_travel_hours: 168
396398
default_partition_expiration_days: 365
397399
default_table_expiration_days: 365
398400
purchase_propensity:
@@ -403,7 +405,7 @@ bigquery:
403405
is_case_insensitive: TRUE
404406
description: "Purchase Propensity Use Case dataset for Marketing behavioural modeling"
405407
friendly_name: "Purchase Propensity Dataset"
406-
max_time_travel_hours: 48
408+
max_time_travel_hours: 168
407409
default_partition_expiration_days: 365
408410
default_table_expiration_days: 365
409411
customer_lifetime_value:
@@ -414,7 +416,7 @@ bigquery:
414416
is_case_insensitive: TRUE
415417
description: "Customer Lifetime Value Use Case dataset for Marketing behavioural modeling"
416418
friendly_name: "Customer Lifetime Value Dataset"
417-
max_time_travel_hours: 48
419+
max_time_travel_hours: 168
418420
default_partition_expiration_days: 365
419421
default_table_expiration_days: 365
420422
audience_segmentation:
@@ -425,7 +427,7 @@ bigquery:
425427
is_case_insensitive: TRUE
426428
description: "Audience Segmentation Use Case dataset for Marketing behavioural modeling"
427429
friendly_name: "Audience Segmentation Dataset"
428-
max_time_travel_hours: 48
430+
max_time_travel_hours: 168
429431
default_partition_expiration_days: 365
430432
default_table_expiration_days: 365
431433
table:
@@ -749,7 +751,7 @@ bigquery:
749751
feature_store_dataset: "feature_store"
750752
mds_project_id: "${project_id}"
751753
mds_dataset: "${mds_dataset}"
752-
expiration_duration_hours: 48
754+
expiration_duration_hours: 168
753755
samples_per_split: 100000
754756
customer_lifetime_value_label:
755757
project_id: "${project_id}"
@@ -767,7 +769,7 @@ bigquery:
767769
feature_store_dataset: "feature_store"
768770
mds_project_id: "${project_id}"
769771
mds_dataset: "${mds_dataset}"
770-
expiration_duration_hours: 48
772+
expiration_duration_hours: 168
771773
purchase_propensity_label:
772774
project_id: "${project_id}"
773775
dataset: "feature_store"
@@ -784,7 +786,7 @@ bigquery:
784786
feature_store_dataset: "feature_store"
785787
mds_project_id: "${project_id}"
786788
mds_dataset: "${mds_dataset}"
787-
expiration_duration_hours: 48
789+
expiration_duration_hours: 168
788790
user_dimensions:
789791
project_id: "${project_id}"
790792
dataset: "feature_store"
@@ -862,23 +864,23 @@ bigquery:
862864
feature_store_project_id: "${project_id}"
863865
feature_store_dataset: "feature_store"
864866
insert_table: "purchase_propensity_inference_preparation"
865-
expiration_duration_hours: 48
867+
expiration_duration_hours: 168
866868
customer_lifetime_value_inference_preparation:
867869
project_id: "${project_id}"
868870
dataset: "customer_lifetime_value"
869871
name: "customer_lifetime_value_inference_preparation"
870872
feature_store_project_id: "${project_id}"
871873
feature_store_dataset: "feature_store"
872874
insert_table: "customer_lifetime_value_inference_preparation"
873-
expiration_duration_hours: 48
875+
expiration_duration_hours: 168
874876
audience_segmentation_inference_preparation:
875877
project_id: "${project_id}"
876878
dataset: "audience_segmentation"
877879
name: "audience_segmentation_inference_preparation"
878880
feature_store_project_id: "${project_id}"
879881
feature_store_dataset: "feature_store"
880882
insert_table: "audience_segmentation_inference_preparation"
881-
expiration_duration_hours: 48
883+
expiration_duration_hours: 168
882884
mds_dataset: "${mds_dataset}"
883885

884886

infrastructure/terraform/modules/feature-store/bigquery-datasets.tf

+20
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,15 @@ resource "google_bigquery_dataset" "feature_store" {
1919
description = local.config_bigquery.dataset.feature_store.description
2020
location = local.config_bigquery.dataset.feature_store.location
2121
max_time_travel_hours = local.config_bigquery.dataset.feature_store.max_time_travel_hours
22+
delete_contents_on_destroy = false
2223

2324
labels = {
2425
version = "pilot"
2526
}
27+
28+
lifecycle {
29+
ignore_changes = all
30+
}
2631
}
2732

2833
resource "google_bigquery_dataset" "purchase_propensity" {
@@ -32,10 +37,15 @@ resource "google_bigquery_dataset" "purchase_propensity" {
3237
description = local.config_bigquery.dataset.purchase_propensity.description
3338
location = local.config_bigquery.dataset.purchase_propensity.location
3439
max_time_travel_hours = local.config_bigquery.dataset.purchase_propensity.max_time_travel_hours
40+
delete_contents_on_destroy = false
3541

3642
labels = {
3743
version = "pilot"
3844
}
45+
46+
lifecycle {
47+
ignore_changes = all
48+
}
3949
}
4050

4151
resource "google_bigquery_dataset" "customer_lifetime_value" {
@@ -45,10 +55,15 @@ resource "google_bigquery_dataset" "customer_lifetime_value" {
4555
description = local.config_bigquery.dataset.customer_lifetime_value.description
4656
location = local.config_bigquery.dataset.customer_lifetime_value.location
4757
max_time_travel_hours = local.config_bigquery.dataset.customer_lifetime_value.max_time_travel_hours
58+
delete_contents_on_destroy = false
4859

4960
labels = {
5061
version = "pilot"
5162
}
63+
64+
lifecycle {
65+
ignore_changes = all
66+
}
5267
}
5368

5469
resource "google_bigquery_dataset" "audience_segmentation" {
@@ -58,8 +73,13 @@ resource "google_bigquery_dataset" "audience_segmentation" {
5873
description = local.config_bigquery.dataset.audience_segmentation.description
5974
location = local.config_bigquery.dataset.audience_segmentation.location
6075
max_time_travel_hours = local.config_bigquery.dataset.audience_segmentation.max_time_travel_hours
76+
delete_contents_on_destroy = false
6177

6278
labels = {
6379
version = "pilot"
6480
}
81+
82+
lifecycle {
83+
ignore_changes = all
84+
}
6585
}

infrastructure/terraform/modules/feature-store/bigquery-tables.tf

+15-15
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ resource "google_bigquery_table" "audience_segmentation_inference_preparation" {
1717
dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id
1818
table_id = local.config_bigquery.table.audience_segmentation_inference_preparation.table_name
1919
description = local.config_bigquery.table.audience_segmentation_inference_preparation.table_description
20-
deletion_protection = false
20+
deletion_protection = true
2121
labels = {
2222
version = "pilot"
2323
}
@@ -30,7 +30,7 @@ resource "google_bigquery_table" "customer_lifetime_value_inference_preparation"
3030
dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id
3131
table_id = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_name
3232
description = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_description
33-
deletion_protection = false
33+
deletion_protection = true
3434
labels = {
3535
version = "pilot"
3636
}
@@ -42,7 +42,7 @@ resource "google_bigquery_table" "customer_lifetime_value_label" {
4242
dataset_id = google_bigquery_dataset.feature_store.dataset_id
4343
table_id = local.config_bigquery.table.customer_lifetime_value_label.table_name
4444
description = local.config_bigquery.table.customer_lifetime_value_label.table_description
45-
deletion_protection = false
45+
deletion_protection = true
4646
labels = {
4747
version = "pilot"
4848
}
@@ -54,7 +54,7 @@ resource "google_bigquery_table" "purchase_propensity_inference_preparation" {
5454
dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id
5555
table_id = local.config_bigquery.table.purchase_propensity_inference_preparation.table_name
5656
description = local.config_bigquery.table.purchase_propensity_inference_preparation.table_description
57-
deletion_protection = false
57+
deletion_protection = true
5858
labels = {
5959
version = "pilot"
6060
}
@@ -66,7 +66,7 @@ resource "google_bigquery_table" "purchase_propensity_label" {
6666
dataset_id = google_bigquery_dataset.feature_store.dataset_id
6767
table_id = local.config_bigquery.table.purchase_propensity_label.table_name
6868
description = local.config_bigquery.table.purchase_propensity_label.table_description
69-
deletion_protection = false
69+
deletion_protection = true
7070
labels = {
7171
version = "pilot"
7272
}
@@ -78,7 +78,7 @@ resource "google_bigquery_table" "user_dimensions" {
7878
dataset_id = google_bigquery_dataset.feature_store.dataset_id
7979
table_id = local.config_bigquery.table.user_dimensions.table_name
8080
description = local.config_bigquery.table.user_dimensions.table_description
81-
deletion_protection = false
81+
deletion_protection = true
8282
labels = {
8383
version = "pilot"
8484
}
@@ -90,7 +90,7 @@ resource "google_bigquery_table" "user_lifetime_dimensions" {
9090
dataset_id = google_bigquery_dataset.feature_store.dataset_id
9191
table_id = local.config_bigquery.table.user_lifetime_dimensions.table_name
9292
description = local.config_bigquery.table.user_lifetime_dimensions.table_description
93-
deletion_protection = false
93+
deletion_protection = true
9494
labels = {
9595
version = "pilot"
9696
}
@@ -103,7 +103,7 @@ resource "google_bigquery_table" "user_lookback_metrics" {
103103
dataset_id = google_bigquery_dataset.feature_store.dataset_id
104104
table_id = local.config_bigquery.table.user_lookback_metrics.table_name
105105
description = local.config_bigquery.table.user_lookback_metrics.table_description
106-
deletion_protection = false
106+
deletion_protection = true
107107
labels = {
108108
version = "pilot"
109109
}
@@ -116,7 +116,7 @@ resource "google_bigquery_table" "user_rolling_window_lifetime_metrics" {
116116
dataset_id = google_bigquery_dataset.feature_store.dataset_id
117117
table_id = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_name
118118
description = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_description
119-
deletion_protection = false
119+
deletion_protection = true
120120
labels = {
121121
version = "pilot"
122122
}
@@ -128,7 +128,7 @@ resource "google_bigquery_table" "user_rolling_window_metrics" {
128128
dataset_id = google_bigquery_dataset.feature_store.dataset_id
129129
table_id = local.config_bigquery.table.user_rolling_window_metrics.table_name
130130
description = local.config_bigquery.table.user_rolling_window_metrics.table_description
131-
deletion_protection = false
131+
deletion_protection = true
132132
labels = {
133133
version = "pilot"
134134
}
@@ -140,7 +140,7 @@ resource "google_bigquery_table" "user_scoped_lifetime_metrics" {
140140
dataset_id = google_bigquery_dataset.feature_store.dataset_id
141141
table_id = local.config_bigquery.table.user_scoped_lifetime_metrics.table_name
142142
description = local.config_bigquery.table.user_scoped_lifetime_metrics.table_description
143-
deletion_protection = false
143+
deletion_protection = true
144144
labels = {
145145
version = "pilot"
146146
}
@@ -152,7 +152,7 @@ resource "google_bigquery_table" "user_scoped_metrics" {
152152
dataset_id = google_bigquery_dataset.feature_store.dataset_id
153153
table_id = local.config_bigquery.table.user_scoped_metrics.table_name
154154
description = local.config_bigquery.table.user_scoped_metrics.table_description
155-
deletion_protection = false
155+
deletion_protection = true
156156
labels = {
157157
version = "pilot"
158158
}
@@ -164,7 +164,7 @@ resource "google_bigquery_table" "user_scoped_segmentation_metrics" {
164164
dataset_id = google_bigquery_dataset.feature_store.dataset_id
165165
table_id = local.config_bigquery.table.user_scoped_segmentation_metrics.table_name
166166
description = local.config_bigquery.table.user_scoped_segmentation_metrics.table_description
167-
deletion_protection = false
167+
deletion_protection = true
168168
labels = {
169169
version = "pilot"
170170
}
@@ -176,7 +176,7 @@ resource "google_bigquery_table" "user_segmentation_dimensions" {
176176
dataset_id = google_bigquery_dataset.feature_store.dataset_id
177177
table_id = local.config_bigquery.table.user_segmentation_dimensions.table_name
178178
description = local.config_bigquery.table.user_segmentation_dimensions.table_description
179-
deletion_protection = false
179+
deletion_protection = true
180180
labels = {
181181
version = "pilot"
182182
}
@@ -188,7 +188,7 @@ resource "google_bigquery_table" "user_session_event_aggregated_metrics" {
188188
dataset_id = google_bigquery_dataset.feature_store.dataset_id
189189
table_id = local.config_bigquery.table.user_session_event_aggregated_metrics.table_name
190190
description = local.config_bigquery.table.user_session_event_aggregated_metrics.table_description
191-
deletion_protection = false
191+
deletion_protection = true
192192
labels = {
193193
version = "pilot"
194194
}

python/pipelines/pipeline_ops.py

+27-16
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,27 @@ def run_pipeline_from_func(
146146
return pl
147147

148148

149+
def _extract_schema_from_bigquery(
        table_name: str,
        table_schema: str,
) -> list:
    """Return the column names of a BigQuery table or view.

    The table is looked up through the BigQuery client. If BigQuery reports
    that it does not exist, a warning is logged and the column names are
    loaded from the local JSON schema file instead, so the pipeline can still
    be compiled before the table or view has been created.

    Args:
        table_name: Table or view identifier handed to
            ``bigquery.Client.get_table``.
        table_schema: Path to a JSON file holding a list of objects, each
            with a ``name`` key. Only read when the table is not found.

    Returns:
        The column names, in the order given by the table schema (or by the
        schema file when the fallback is used).

    Raises:
        OSError: If the fallback schema file cannot be opened.
        json.JSONDecodeError: If the fallback schema file is not valid JSON.

    Any BigQuery error other than ``NotFound`` is not handled here and
    propagates to the caller.
    """
    import json

    from google.api_core import exceptions
    from google.cloud import bigquery

    try:
        client = bigquery.Client()
        table = client.get_table(table_name)
        columns = [field.name for field in table.schema]
    except exceptions.NotFound:
        # The original code referenced an undefined `schema_name` here, which
        # raised NameError instead of falling back to the schema file.
        logging.warning(
            'Pipeline compiled without columns transformation. '
            'Make sure the `data_source_bigquery_table_path` table or view exists! '
            'Loading default values from schema file %s.',
            table_schema,
        )
        with open(table_schema, encoding='utf-8') as schema_file:
            fields = json.load(schema_file)
        columns = [feature['name'] for feature in fields]
    return columns
168+
169+
149170
def compile_automl_tabular_pipeline(
150171
template_path: str,
151172
parameters_path: str,
@@ -227,18 +248,10 @@ def compile_automl_tabular_pipeline(
227248
pipeline_parameters['transformations'] = pipeline_parameters['transformations'].format(
228249
timestamp=datetime.now().strftime("%Y%m%d%H%M%S"))
229250

230-
from google.cloud import bigquery
231-
from google.api_core import exceptions
232-
233-
try:
234-
client = bigquery.Client()
235-
table = client.get_table(
236-
pipeline_parameters['data_source_bigquery_table_path'].split('/')[-1])
237-
schema = [schema.name for schema in table.schema]
238-
except exceptions.NotFound as e:
239-
logging.warn(f'Pipeline compiled without columns transformation. \
240-
Make sure the `data_source_bigquery_table_path` table or view exists in your config.yaml!')
241-
schema = []
251+
schema = _extract_schema_from_bigquery(
252+
table_name=pipeline_parameters['data_source_bigquery_table_path'].split('/')[-1],
253+
table_schema=pipeline_parameters['data_source_bigquery_table_schema']
254+
)
242255

243256
for column_to_remove in exclude_features + [
244257
pipeline_parameters['target_column'],
@@ -249,10 +262,7 @@ def compile_automl_tabular_pipeline(
249262
if column_to_remove in schema:
250263
schema.remove(column_to_remove)
251264

252-
logging.info(f'features:{schema}' )
253-
# need to remove later
254-
# if "default" in schema:
255-
# schema.remove("default")
265+
logging.info(f'features:{schema}')
256266

257267
write_auto_transformations(pipeline_parameters['transformations'], schema)
258268
if pipeline_parameters['predefined_split_key']:
@@ -262,6 +272,7 @@ def compile_automl_tabular_pipeline(
262272

263273
# write_to_gcs(pipeline_parameters['transform_config_path'], json.dumps(transformations))
264274

275+
pipeline_parameters.pop('data_source_bigquery_table_schema', None)
265276
(
266277
tp,
267278
parameter_values,

0 commit comments

Comments
 (0)