diff --git a/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql b/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql index 92dcc663..05ecace4 100644 --- a/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql +++ b/data/transform/models/staging/snowplow/snowplow_bad_parsed.sql @@ -1,9 +1,3 @@ -{{ - config( - materialized='table' - ) -}} - WITH reparse_1 AS ( -- Incident: new schema not registered to SnowcatCloud versions 2.14.0 and -- 2.15.0 (partial) diff --git a/data/transform/models/staging/snowplow/stg_snowplow__events.sql b/data/transform/models/staging/snowplow/stg_snowplow__events.sql index 72bc8e3d..ab5672ae 100644 --- a/data/transform/models/staging/snowplow/stg_snowplow__events.sql +++ b/data/transform/models/staging/snowplow/stg_snowplow__events.sql @@ -1,13 +1,39 @@ {{ config( - materialized='table' + materialized='incremental' ) }} - WITH blended_source AS ( - SELECT * - FROM {{ ref('stg_snowplow__events_union_all') }} + SELECT + *, + FALSE AS snowplow_bad_parsed + {% if env_var("MELTANO_ENVIRONMENT") == "cicd" %} + + FROM raw.snowplow.events + WHERE derived_tstamp::TIMESTAMP >= DATEADD('day', -3, CURRENT_DATE) + {% else %} + + FROM {{ source('snowplow', 'events') }} + + {% if is_incremental() %} + + WHERE UPLOADED_AT >= (SELECT max(UPLOADED_AT) FROM {{ this }} WHERE snowplow_bad_parsed = FALSE) + + {% endif %} + {% endif %} + + UNION ALL + + SELECT + *, + TRUE AS snowplow_bad_parsed + FROM {{ ref('snowplow_bad_parsed') }} + {% if is_incremental() %} + + WHERE event_id NOT IN (SELECT DISTINCT event_id FROM {{ this }} WHERE snowplow_bad_parsed = TRUE) + + {% endif %} ), @@ -18,7 +44,7 @@ source AS ( ROW_NUMBER() OVER ( PARTITION BY event_id - ORDER BY event_created_at::TIMESTAMP DESC + ORDER BY derived_tstamp::TIMESTAMP DESC ) AS row_num FROM blended_source @@ -29,8 +55,158 @@ clean_new_source AS ( SELECT * FROM source WHERE row_num = 1 + {% if is_incremental() %} + + AND event_id NOT IN ( + SELECT DISTINCT event_id FROM {{ this }} + ) + {% endif %} + +), + +renamed AS ( + + SELECT -- noqa: L034 + -- only meltano events. For the first ~6 months no app_id was + -- sent from Meltano. So nulls are from meltano. + COALESCE(app_id, 'meltano') AS app_id, + platform, + etl_tstamp::TIMESTAMP AS etl_enriched_at, + collector_tstamp::TIMESTAMP AS collector_received_at, + dvce_created_tstamp::TIMESTAMP AS device_created_at, + dvce_sent_tstamp::TIMESTAMP AS device_sent_at, + refr_dvce_tstamp::TIMESTAMP AS refr_device_processed_at, + derived_tstamp::TIMESTAMP AS event_created_at, + derived_tstamp::DATE AS event_created_date, + true_tstamp::TIMESTAMP AS true_sent_at, + uploaded_at, + event, + event_id, + txn_id, + name_tracker, + v_tracker, + v_collector, + v_etl, + user_id, + user_ipaddress, + user_fingerprint, + domain_userid, + domain_sessionidx, + network_userid, + geo_country, + geo_region, + geo_city, + geo_zipcode, + geo_latitude, + geo_longitude, + geo_region_name, + ip_isp, + ip_organization, + ip_domain, + ip_netspeed, + page_url, + page_title, + page_referrer, + page_urlscheme, + page_urlhost, + page_urlport, + page_urlpath, + page_urlquery, + page_urlfragment, + refr_urlscheme, + refr_urlhost, + refr_urlport, + refr_urlpath, + refr_urlquery, + refr_urlfragment, + refr_medium, + refr_source, + refr_term, + mkt_medium, + mkt_source, + mkt_term, + mkt_content, + mkt_campaign, + contexts, + se_category, + se_action, + se_label, + se_property, + se_value, + unstruct_event, + tr_orderid, + tr_affiliation, + tr_total, + tr_tax, + tr_shipping, + tr_city, + tr_state, + tr_country, + ti_orderid, + ti_sku, + ti_name, + ti_category, + ti_price, + ti_quantity, + pp_xoffset_min, + pp_xoffset_max, + pp_yoffset_min, + pp_yoffset_max, + useragent, + br_name, + br_family, + br_version, + br_type, + br_renderengine, + br_lang, + br_features_pdf, + br_features_flash, + br_features_java, + br_features_director, + br_features_quicktime, + br_features_realplayer, + br_features_windowsmedia, + br_features_gears, + br_features_silverlight, + br_cookies, + br_colordepth, + br_viewwidth, + br_viewheight, + os_name, + os_family, + os_manufacturer, + os_timezone, + dvce_type, + dvce_ismobile, + dvce_screenwidth, + dvce_screenheight, + doc_charset, + doc_width, + doc_height, + tr_currency, + tr_total_base, + tr_tax_base, + tr_shipping_base, + ti_currency, + ti_price_base, + base_currency, + geo_timezone, + mkt_clickid, + mkt_network, + etl_tags, + refr_domain_userid, + derived_contexts::VARIANT AS derived_contexts, + domain_sessionid, + event_vendor, + event_name, + event_format, + event_version, + event_fingerprint, + MD5(user_ipaddress) AS ip_address_hash, + snowplow_bad_parsed + FROM clean_new_source ) SELECT * -FROM clean_new_source +FROM renamed diff --git a/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql b/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql deleted file mode 100644 index 9ed2b61b..00000000 --- a/data/transform/models/staging/snowplow/stg_snowplow__events_union_all.sql +++ /dev/null @@ -1,186 +0,0 @@ -{{ - config( - materialized='incremental' - ) -}} - -WITH blended_source AS ( - - SELECT - *, - FALSE AS snowplow_bad_parsed - {% if env_var("MELTANO_ENVIRONMENT") == "cicd" %} - - FROM raw.snowplow.events - WHERE derived_tstamp::TIMESTAMP >= DATEADD('day', -3, CURRENT_DATE) - {% else %} - - FROM {{ source('snowplow', 'events') }} - - {% if is_incremental() %} - - WHERE UPLOADED_AT >= (SELECT max(UPLOADED_AT) FROM {{ this }} WHERE snowplow_bad_parsed = FALSE) - - {% endif %} - {% endif %} - - UNION ALL - - SELECT - *, - TRUE AS snowplow_bad_parsed - FROM {{ ref('snowplow_bad_parsed') }} - {% if is_incremental() %} - - WHERE event_id NOT IN (SELECT DISTINCT event_id FROM {{ this }} WHERE snowplow_bad_parsed = TRUE) - - {% endif %} - -), - -renamed AS ( - - SELECT -- noqa: L034 - -- only meltano events. For the first ~6 months no app_id was - -- sent from Meltano. So nulls are from meltano. - COALESCE(app_id, 'meltano') AS app_id, - platform, - etl_tstamp::TIMESTAMP AS etl_enriched_at, - collector_tstamp::TIMESTAMP AS collector_received_at, - dvce_created_tstamp::TIMESTAMP AS device_created_at, - dvce_sent_tstamp::TIMESTAMP AS device_sent_at, - refr_dvce_tstamp::TIMESTAMP AS refr_device_processed_at, - derived_tstamp::TIMESTAMP AS event_created_at, - derived_tstamp::DATE AS event_created_date, - true_tstamp::TIMESTAMP AS true_sent_at, - uploaded_at, - event, - event_id, - txn_id, - name_tracker, - v_tracker, - v_collector, - v_etl, - user_id, - user_ipaddress, - user_fingerprint, - domain_userid, - domain_sessionidx, - network_userid, - geo_country, - geo_region, - geo_city, - geo_zipcode, - geo_latitude, - geo_longitude, - geo_region_name, - ip_isp, - ip_organization, - ip_domain, - ip_netspeed, - page_url, - page_title, - page_referrer, - page_urlscheme, - page_urlhost, - page_urlport, - page_urlpath, - page_urlquery, - page_urlfragment, - refr_urlscheme, - refr_urlhost, - refr_urlport, - refr_urlpath, - refr_urlquery, - refr_urlfragment, - refr_medium, - refr_source, - refr_term, - mkt_medium, - mkt_source, - mkt_term, - mkt_content, - mkt_campaign, - contexts, - se_category, - se_action, - se_label, - se_property, - se_value, - unstruct_event, - tr_orderid, - tr_affiliation, - tr_total, - tr_tax, - tr_shipping, - tr_city, - tr_state, - tr_country, - ti_orderid, - ti_sku, - ti_name, - ti_category, - ti_price, - ti_quantity, - pp_xoffset_min, - pp_xoffset_max, - pp_yoffset_min, - pp_yoffset_max, - useragent, - br_name, - br_family, - br_version, - br_type, - br_renderengine, - br_lang, - br_features_pdf, - br_features_flash, - br_features_java, - br_features_director, - br_features_quicktime, - br_features_realplayer, - br_features_windowsmedia, - br_features_gears, - br_features_silverlight, - br_cookies, - br_colordepth, - br_viewwidth, - br_viewheight, - os_name, - os_family, - os_manufacturer, - os_timezone, - dvce_type, - dvce_ismobile, - dvce_screenwidth, - dvce_screenheight, - doc_charset, - doc_width, - doc_height, - tr_currency, - tr_total_base, - tr_tax_base, - tr_shipping_base, - ti_currency, - ti_price_base, - base_currency, - geo_timezone, - mkt_clickid, - mkt_network, - etl_tags, - refr_domain_userid, - derived_contexts::VARIANT AS derived_contexts, - domain_sessionid, - event_vendor, - event_name, - event_format, - event_version, - event_fingerprint, - MD5(user_ipaddress) AS ip_address_hash, - snowplow_bad_parsed - FROM blended_source - -) - -SELECT * -FROM renamed