diff --git a/CHANGELOG b/CHANGELOG index e55a186..3b49c01 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,11 +1,19 @@ snowplow-media-player 0.7.0 (2023-xx-xx) --------------------------------------- ## Summary -This release adds a more robust unique media identifier. This fixes an issue where duplicate `media_id` values could occur in the media stats table as a result of incorrect tracking implementation (e.g. sharing the same media label across different media types). This release also fixes the incremental materialization of the media_ad_views table by adding a unique primary key. +This version adds new features powered by a complete refactor of the core processing of the package by moving it out to the new `base` macro functionality provided in `snowplow_utils`. This enables users to now specify custom fields for sessionization and user identification, to add custom entity/SDE fields to the base events table for redshift/postgres, and to add passthrough fields to the derived tables so you can now more easily add your own fields to our tables. + +In addition, this release adds a more robust unique media identifier. This fixes an issue where duplicate `media_id` values could occur in the media stats table as a result of incorrect tracking implementation (e.g. sharing the same media label across different media types). This release also fixes the incremental materialization of the media_ad_views table by adding a unique primary key. 
+ +## Features +- Migrate base models to the new `base` macros for flexibility and consistency +- Add ability to pass fields through to derived media base and ad views tables +- Add new field `domain_sessionid_array` to derived tables (where applicable) ## Fixes - Add unique media identifier (close #59) - Add missing primary key to media_ad_views +- Fix field names in custom session stats model yaml (close #63) ## Under the hood diff --git a/README.md b/README.md index 51ed692..7dbacd2 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # dbt-snowplow-media-player -A fully incremental model that transforms media player event data into derived tables for easier querying generated by the Snowplow [JavaScript tracker][javascript-tracker] in combination with media tracking specific plugins such as the [Media Tracking plugin][media-tracking] or the [YouTube Tracking plugin][youtube-tracking]. +A fully incremental model that transforms media player event data into derived tables for easier querying generated by the Snowplow [JavaScript tracker][javascript-tracker] in combination with media tracking specific plugins such as the [Media Tracking plugin][media-tracking] or the [YouTube Tracking plugin][youtube-tracking]. The package also supports media events generated by the Snowplow [iOS and Android trackers][mobile-media-tracker-docs]. Please refer to the [doc site][snowplow-media-player-docs] for a full breakdown of the package. @@ -122,3 +122,5 @@ limitations under the License. 
[snowplow-media-player-docs-dbt]: https://snowplow.github.io/dbt-snowplow-media-player/#!/overview/snowplow_media_player [snowplow-media-player-docs]: https://docs.snowplow.io/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-media-player-data-model/ + +[mobile-media-tracker-docs]: https://docs.snowplow.io/docs/collecting-data/collecting-from-own-applications/mobile-trackers/tracking-events/media-tracking/ diff --git a/dbt_project.yml b/dbt_project.yml index ed6183e..e655a2b 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -72,6 +72,9 @@ vars: snowplow__enable_web_events: true snowplow__enable_mobile_events: false snowplow__enable_ad_quartile_event: false + # add extra custom fields: + snowplow__base_passthroughs: [] + snowplow__ad_views_passthroughs: [] # Variables - Warehouse Specific snowplow__media_player_event_context: 'com_snowplowanalytics_snowplow_media_player_event_1' diff --git a/docs/markdown/snowplow_media_player_common_cols.md b/docs/markdown/snowplow_media_player_common_cols.md index 2c993c7..ccd64b2 100644 --- a/docs/markdown/snowplow_media_player_common_cols.md +++ b/docs/markdown/snowplow_media_player_common_cols.md @@ -19,7 +19,15 @@ A UUID for each page view e.g. `c6ef3124-b53a-4b13-a233-0088f79dcbcb`. {% enddocs %} {% docs col_session_identifier %} -A visit / session UUID e.g. `c6ef3124-b53a-4b13-a233-0088f79dcbcb`. +The session identifier as defined in your project variables. Defaults to the media_session_id, or to page_view_id if the media session entity is not enabled. +{% enddocs %} + +{% docs col_domain_sessionid_array %} +All domain_sessionids seen for a play_id. +{% enddocs %} + +{% docs col_user_identifier %} +The user identifier as defined in your project variables. Defaults to domain_userid. {% enddocs %} {% docs col_domain_userid %} @@ -1005,31 +1013,31 @@ Datetime of the last event. {% enddocs %} {% docs col_views_unique %} -Number of users that viewed the ad (identified by their domain_userid). 
+Number of users that viewed the ad (identified by their user_identifier). {% enddocs %} {% docs col_clicked_unique %} -Number of users that clicked on the ad (identified by their domain_userid). +Number of users that clicked on the ad (identified by their user_identifier). {% enddocs %} {% docs col_skipped_unique %} -Number of users that skipped the ad (identified by their domain_userid). +Number of users that skipped the ad (identified by their user_identifier). {% enddocs %} {% docs col_percent_reached_25_unique %} -Number of users that watched 25% of the ad (identified by their domain_userid). +Number of users that watched 25% of the ad (identified by their user_identifier). {% enddocs %} {% docs col_percent_reached_50_unique %} -Number of users that watched 50% of the ad (identified by their domain_userid). +Number of users that watched 50% of the ad (identified by their user_identifier). {% enddocs %} {% docs col_percent_reached_75_unique %} -Number of users that watched 75% of the ad (identified by their domain_userid). +Number of users that watched 75% of the ad (identified by their user_identifier). {% enddocs %} {% docs col_percent_reached_100_unique %} -Number of users that watched 100% of the ad (identified by their domain_userid). +Number of users that watched 100% of the ad (identified by their user_identifier). {% enddocs %} {% docs col_media_session_id %} diff --git a/docs/markdown/snowplow_media_player_model_docs.md b/docs/markdown/snowplow_media_player_model_docs.md index 4ddfd66..5f84eda 100644 --- a/docs/markdown/snowplow_media_player_model_docs.md +++ b/docs/markdown/snowplow_media_player_model_docs.md @@ -1,7 +1,3 @@ -{% docs table_interactions_this_run %} -This staging table shows all media player events within the current incremental run and calculates play_time. It could be used in custom models for more in-depth time based calculations. 
-{% enddocs %} - {% docs table_base_this_run %} This staging table aggregates media player interactions within the current run to a pageview level that is considered a base level for media plays. {% enddocs %} @@ -15,11 +11,11 @@ This view removes impressions from the derived snowplow_media_player_base table {% enddocs %} {% docs table_session_stats %} -This table aggregates the pageview level interactions to show session level media stats. +This table aggregates the base level plays to show session level media stats. {% enddocs %} {% docs table_user_stats %} -This table aggregates the pageview level interactions to show user level media stats. +This table aggregates the session level stats to show user level media stats. {% enddocs %} {% docs table_media_stats %} @@ -37,5 +33,5 @@ This derived table aggregates information about ad views. Each ad view (a user v {% enddocs %} {% docs table_media_ads %} -This derived table aggregates information about ads. Each row represents one ad played within a certain media on a certain platform. Stats about the number of ad clicks, progress reached and more are calculated as total values but also as counts of unique users (identified using `domain_userid`). +This derived table aggregates information about ads. Each row represents one ad played within a certain media on a certain platform. Stats about the number of ad clicks, progress reached and more are calculated as total values but also as counts of unique users (identified using `user_identifier`). {% enddocs %} diff --git a/docs/markdown/snowplow_media_player_overview.md b/docs/markdown/snowplow_media_player_overview.md index e6fd058..6363960 100644 --- a/docs/markdown/snowplow_media_player_overview.md +++ b/docs/markdown/snowplow_media_player_overview.md @@ -4,31 +4,23 @@ # Snowplow Media Player Package -Welcome to the documentation site for the Snowplow Media Player dbt package. 
The package is built as an extension of the [dbt-snowplow-web package][dbt-snowplow-web] that transforms raw media player event data into derived tables for easier querying generated by the Snowplow [JavaScript tracker][javascript-tracker] in combination with media tracking specific plugins such as the [Media Tracking plugin][media-tracking] or the [YouTube Tracking plugin][youtube-tracking]. +Welcome to the documentation site for the Snowplow Media Player dbt package. The package transforms raw media player event data into derived tables for easier querying generated by the Snowplow [JavaScript tracker][javascript-tracker] in combination with media tracking specific plugins such as the [Media Tracking plugin][media-tracking] or the [YouTube Tracking plugin][youtube-tracking]. The package also supports media events generated by the Snowplow [iOS and Android trackers][mobile-media-tracker-docs]. -**For more information, including the dependency on the Snowplow Web package as well as a QuickStart guide, operation and configuration, and implementing your own custom modules on top of this please visit the [Snowplow Docs](https://docs.snowplow.io/docs/modeling-your-data/modeling-your-data-with-dbt/) for more detailed information.** +**For more information, including a QuickStart guide, operation and configuration, and implementing your own custom modules on top of this please visit the [Snowplow Docs](https://docs.snowplow.io/docs/modeling-your-data/modeling-your-data-with-dbt/) for more detailed information.** *Note this model design doc site is linked to latest release of the package. 
If you are not using the latest release, [generate and serve](https://docs.getdbt.com/reference/commands/cmd-docs#dbt-docs-serve) the doc site locally for accurate documentation.* ## Overview -This package consists of a series of dbt models with the goal to produce the following main aggregated models from the raw media player events and relevant contexts: +This package consists of a series of incremental dbt models with the goal to produce the following main aggregated models from the raw media player events and relevant contexts: - - `snowplow_media_player_base`: This derived table summarizes the key media player events and metrics of each media element on a media_identifier and pageview level which is considered as a base aggregation level for media interactions. - - - `snowplow_media_player_plays_by_pageview`: This view removes impressions from the '_base' table to summarize media plays on a page_view by media_identifier level. - - - `snowplow_media_player_media_stats`: This derived table aggregates the '_base' table to individual media_identifier level, calculating the main KPIs and overall video/audio metrics. - -The package is built on top of the [dbt-snowplow-web package][dbt-snowplow-web] taking that as a basis to carry out the incremental update. It is designed to be run together with the web model in a similar manner to how a custom module would run: - -The `_interactions_this_run` table takes the `snowplow_web_base_events_this_run` table generated by the web package as an input then adds the various contexts to enrich the base table with the additional media related fields. It could be used for custom models for more in-depth event level derived tables and further analysis. - -The `_base_this_run` table then aggregates the `_interactions_this_run` table to media_identifier and pageview level and serves as a basis for the incrementalized derived table `_media_base`. 
- -The main `_media_stats` derived table will also be updated incrementally based on the `_media_base` derived table, however not through the snowplow_incremental materialization, but using the native dbt incremental materialization on a pageview basis after a set time window passed. This is to prevent complex and expensive queries due to metrics which need to take the whole page_view events into calculation. This way the metrics will only be calculated once per pageview / media, after no new events are expected. - -The additional `_pivot_base` table is there to calculate the percent_progress boundaries and weights that are used to calculate the total play_time and other related media fields. +| Model | Description | +|------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| snowplow_media_player_base | A table summarizing media player events by media and pageview including impressions. | +| snowplow_media_player_plays_by_pageview | A view summarizing media plays by media on a pageview level. | +| snowplow_media_player_media_stats | An aggregated table of media metrics on a media_identifier level. | +| snowplow_media_player_media_ad_views | A view summarizing each ad viewed within a media playback (only for v2 schemas, see above). | +| snowplow_media_player_media_ads | An aggregated table of ad metrics for each ad played within each media content (only for v2 schemas, see above). | ## Installation @@ -46,7 +38,7 @@ If you find a bug, please report an issue on GitHub. The snowplow-media-player package is Copyright 2022 Snowplow Analytics Ltd. -Licensed under the [Apache License, Version 2.0][license] (the "License"); +Licensed under the [Snowplow Personal and Academic License][license] (the "License"); you may not use this software except in compliance with the License. 
Unless required by applicable law or agreed to in writing, software @@ -55,8 +47,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -[license]: http://www.apache.org/licenses/LICENSE-2.0 -[license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat +[license]: https://docs.snowplow.io/personal-and-academic-license-1.0/ +[license-image]: http://img.shields.io/badge/license-Snowplow--Personal--and--Academic--1-blue.svg?style=flat [tracker-classificiation]: https://docs.snowplow.io/docs/collecting-data/collecting-from-own-applications/tracker-maintenance-classification/ [early-release]: https://img.shields.io/static/v1?style=flat&label=Snowplow&message=Early%20Release&color=014477&labelColor=9ba0aa&logo= @@ -88,7 +80,7 @@ limitations under the License. [dbt-snowplow-web]: https://snowplow.github.io/dbt-snowplow-web/#!/overview/snowplow_web -[flutter-tracker]: https://docs.snowplow.io/docs/collecting-data/collecting-from-own-applications/flutter-tracker/ +[mobile-media-tracker-docs]: https://docs.snowplow.io/docs/collecting-data/collecting-from-own-applications/mobile-trackers/tracking-events/media-tracking/ {% endraw %} diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 8f8dfae..4883688 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -37,6 +37,12 @@ models: +enabled: '{{ target.type in ["redshift", "postgres"] | as_bool() }}' snowflake: +enabled: '{{ target.type == "snowflake" | as_bool() }}' + snowplow_media_player: + +persist_docs: + relation: true + columns: true + custom: + +enabled: true vars: snowplow__enable_media_ad: true @@ -63,17 +69,8 @@ vars: snowplow__enable_media_ad_break: true snowplow__enable_ad_quartile_event: true snowplow__enable_mobile_events: true - - # Variables - Warehouse Specific - snowplow__page_view_context: 
'snowplow_web_page_view_context' - snowplow__context_mobile_session: | - {% if target.type in ['postgres', 'redshift'] -%} - com_snowplowanalytics_snowplow_client_session_1 - {%- elif target.type in ['bigquery'] -%} - contexts_com_snowplowanalytics_snowplow_client_session_1_0_2 - {%- else -%} - contexts_com_snowplowanalytics_snowplow_client_session_1 - {%- endif %} + snowplow__base_passthroughs: ['v_collector', {'sql': 'v_tracker || app_id', 'alias': 'tracker_app_id'}] + snowplow__ad_views_passthroughs: ['v_collector', {'sql': 'v_tracker || app_id', 'alias': 'tracker_app_id'}] seeds: quote_columns: false diff --git a/macros/identifiers.sql b/macros/identifiers.sql index 6ba13ee..ebc7638 100644 --- a/macros/identifiers.sql +++ b/macros/identifiers.sql @@ -6,29 +6,129 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 #} {% macro session_identifiers() %} + {{ return(adapter.dispatch('session_identifiers', 'snowplow_media_player')()) }} +{% endmacro %} + - {% if target.type == 'snowflake' -%} - {% set mobile_field = 'sessionId' %} - {%- else -%} - {% set mobile_field = 'session_id' %} - {%- endif %} +{% macro default__session_identifiers() %} {% if var('snowplow__session_identifiers') %} {{ return(var('snowplow__session_identifiers')) }} {% else %} - {% if var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {% if var('snowplow__enable_media_session') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_snowplow_media_session_1', 'field': 'media_session_id', 'prefix': 'media_session_'}, + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 
'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}]) }} + + {% else %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} + + +{% macro snowflake__session_identifiers() %} + + {% if var('snowplow__session_identifiers') %} + {{ return(var('snowplow__session_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_media_session') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_snowplow_media_session_1', 'field': 'mediaSessionId', 'prefix': 'media_session_'}, + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}]) }} + + {% else %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} + +{% macro bigquery__session_identifiers() %} + + {% if var('snowplow__session_identifiers') %} + {{ return(var('snowplow__session_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_media_session') %} + {{ return([ + {'schema': 
'contexts_com_snowplowanalytics_snowplow_media_session_1_*', 'field': 'media_session_id', 'prefix': 'media_session_'}, + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1_*', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1_*', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_mobile_screen_1_*', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1_*', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_mobile_screen_1_*', 'field': 'id', 'prefix': 'mobile_screen_'}]) }} + + {% else %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_web_page_1_*', 'field': 'id', 'prefix': 'web_page_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} + +{% macro postgres__session_identifiers() %} + + {% if var('snowplow__session_identifiers') %} + {{ return(var('snowplow__session_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_media_session') %} + {{ return([ + {'schema': 'com_snowplowanalytics_snowplow_media_session_1', 'field': 'media_session_id', 'prefix': 'media_session_'}, + {'schema': 'com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 'com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} + ]) }} + + {% elif var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} {{ return([ - {'schema': var('snowplow__context_mobile_session'), 'field': mobile_field, 'prefix': 'session_'}, - {'schema': 'atomic', 'field': 'domain_sessionid', 'prefix': 'session_'} + {'schema': 'com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}, + {'schema': 
'com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'} ]) }} {% elif var('snowplow__enable_mobile_events') %} - {{ return([{'schema': var('snowplow__context_mobile_session'), 'field': mobile_field, 'prefix': 'session_'}]) }} + {{ return([{'schema': 'com_snowplowanalytics_mobile_screen_1', 'field': 'id', 'prefix': 'mobile_screen_'}]) }} {% else %} - {{ return([{'schema': 'atomic', 'field': 'domain_sessionid', 'prefix': 'session_'}] )}} + {{ return([{'schema': 'com_snowplowanalytics_snowplow_web_page_1', 'field': 'id', 'prefix': 'web_page_'}] )}} {% endif %} {% endif %} @@ -37,12 +137,84 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 {% macro user_identifiers() %} + {# Dispatch in this package's own namespace (as session_identifiers() does) so the adapter-specific *__user_identifiers macros defined in this file are resolved. #} {{ return(adapter.dispatch('user_identifiers', 'snowplow_media_player')()) }} +{% endmacro %} + + +{% macro default__user_identifiers() %} + + {% if var('snowplow__user_identifiers') %} + {{ return(var('snowplow__user_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'field': 'user_id', 'prefix': 'user_'}, + {'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'field': 'user_id', 'prefix': 'user_'}]) }} + + {% else %} + {{ return([{'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} + + +{% macro snowflake__user_identifiers() %} + + {% if var('snowplow__user_identifiers') %} + {{ return(var('snowplow__user_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'field': 'userId', 'prefix': 'user_'}, + {'schema': 
'atomic', 'field': 'domain_userid', 'prefix': 'user_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'field': 'userId', 'prefix': 'user_'}]) }} + + {% else %} + {{ return([{'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} + +{% macro bigquery__user_identifiers() %} + + {% if var('snowplow__user_identifiers') %} + {{ return(var('snowplow__user_identifiers')) }} + + {% else %} + + {% if var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} + {{ return([ + {'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1_*', 'field': 'user_id', 'prefix': 'user_'}, + {'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'} + ]) }} + + {% elif var('snowplow__enable_mobile_events') %} + {{ return([{'schema': 'contexts_com_snowplowanalytics_snowplow_client_session_1_*', 'field': 'user_id', 'prefix': 'user_'}]) }} + + {% else %} + {{ return([{'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'}] )}} + + {% endif %} + {% endif %} + +{% endmacro %} - {% if target.type == 'snowflake' -%} - {% set mobile_field = 'userId' %} - {%- else -%} - {% set mobile_field = 'user_id' %} - {%- endif %} +{% macro postgres__user_identifiers() %} {% if var('snowplow__user_identifiers') %} {{ return(var('snowplow__user_identifiers')) }} @@ -51,12 +223,12 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 {% if var('snowplow__enable_web_events') and var('snowplow__enable_mobile_events') %} {{ return([ - {'schema': var('snowplow__context_mobile_session'), 'field': mobile_field, 'prefix': 'user_'}, + {'schema': 'com_snowplowanalytics_snowplow_client_session_1', 'field': 'user_id', 'prefix': 'user_'}, {'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'} ]) }} {% elif var('snowplow__enable_mobile_events') %} - {{ return([{'schema': 
var('snowplow__context_mobile_session'), 'field': mobile_field, 'prefix': 'user_'}]) }} + {{ return([{'schema': 'com_snowplowanalytics_snowplow_client_session_1', 'field': 'user_id', 'prefix': 'user_'}]) }} {% else %} {{ return([{'schema': 'atomic', 'field': 'domain_userid', 'prefix': 'user_'}] )}} diff --git a/models/base/scratch/bigquery/snowplow_media_player_base_events_this_run.sql b/models/base/scratch/bigquery/snowplow_media_player_base_events_this_run.sql index 6612f22..8f95dea 100644 --- a/models/base/scratch/bigquery/snowplow_media_player_base_events_this_run.sql +++ b/models/base/scratch/bigquery/snowplow_media_player_base_events_this_run.sql @@ -54,6 +54,10 @@ prep as ( web={ 'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_web_page_1' }, mobile={'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_mobile_screen_1' } ) }} as page_view_id, + {{ web_or_mobile_field( + web='a.domain_sessionid', + mobile={ 'field': 'session_id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_client_session_1' } + ) }} as original_session_identifier, -- unpacking the media player event {{ media_player_field( diff --git a/models/base/scratch/databricks/snowplow_media_player_base_events_this_run.sql b/models/base/scratch/databricks/snowplow_media_player_base_events_this_run.sql index c64ba81..2230beb 100644 --- a/models/base/scratch/databricks/snowplow_media_player_base_events_this_run.sql +++ b/models/base/scratch/databricks/snowplow_media_player_base_events_this_run.sql @@ -55,6 +55,10 @@ prep as ( web={ 'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'dtype': 'string' }, mobile={ 'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_mobile_screen_1', 'dtype': 'string' } ) }} as page_view_id, + {{ web_or_mobile_field( + web='a.domain_sessionid', + mobile={ 'field': 'session_id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'dtype': 'string' } + ) }} as 
original_session_identifier, -- unpacking the media player event {{ media_player_field( diff --git a/models/base/scratch/default/snowplow_media_player_base_events_this_run.sql b/models/base/scratch/default/snowplow_media_player_base_events_this_run.sql index c2255a5..071629c 100644 --- a/models/base/scratch/default/snowplow_media_player_base_events_this_run.sql +++ b/models/base/scratch/default/snowplow_media_player_base_events_this_run.sql @@ -98,6 +98,7 @@ prep as ( ev.*, {{ web_or_mobile_field(web='ev.web_page__id', mobile='ev.mobile_screen__id') }} as page_view_id, + {{ web_or_mobile_field(web='ev.domain_sessionid', mobile='ev.mobile_session__session_id') }} as original_session_identifier, -- unpacking the media player event {{ media_player_field(v1='ev.media_player_event__label', v2='ev.media_player_v2__label') }} as media_label, diff --git a/models/base/scratch/snowflake/snowplow_media_player_base_events_this_run.sql b/models/base/scratch/snowflake/snowplow_media_player_base_events_this_run.sql index 8da633d..0988247 100644 --- a/models/base/scratch/snowflake/snowplow_media_player_base_events_this_run.sql +++ b/models/base/scratch/snowflake/snowplow_media_player_base_events_this_run.sql @@ -55,6 +55,10 @@ prep as ( web={ 'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_web_page_1', 'dtype': 'varchar' }, mobile={ 'field': 'id', 'col_prefix': 'contexts_com_snowplowanalytics_mobile_screen_1', 'dtype': 'varchar' } ) }} as page_view_id, + {{ web_or_mobile_field( + web='a.domain_sessionid', + mobile={ 'field': 'session_id', 'col_prefix': 'contexts_com_snowplowanalytics_snowplow_client_session_1', 'dtype': 'varchar' } + ) }} as original_session_identifier, -- unpacking the media player event {{ media_player_field( diff --git a/models/custom/snowplow_media_player_custom.yml b/models/custom/snowplow_media_player_custom.yml index b19d899..75dbaeb 100644 --- a/models/custom/snowplow_media_player_custom.yml +++ 
b/models/custom/snowplow_media_player_custom.yml @@ -5,15 +5,15 @@ models: +tags: "snowplow_media_player_incremental" description: '{{ doc("table_session_stats") }}' columns: - - name: session_identifier + - name: domain_sessionid description: '{{ doc("col_session_identifier") }}' tags: - primary-key tests: - unique - not_null - - name: domain_userid - description: '{{ doc("col_domain_userid") }}' + - name: user_identifier + description: '{{ doc("col_user_identifier") }}' - name: impressions description: '{{ doc("col_impressions") }}' - name: videos_played @@ -48,8 +48,8 @@ models: +tags: "snowplow_media_player_incremental" description: '{{ doc("table_user_stats") }}' columns: - - name: domain_userid - description: '{{ doc("col_domain_userid") }}' + - name: user_identifier + description: '{{ doc("col_user_identifier") }}' tags: - primary-key tests: diff --git a/models/custom/snowplow_media_player_session_stats.sql b/models/custom/snowplow_media_player_session_stats.sql index 6cab4ad..0eb63b9 100644 --- a/models/custom/snowplow_media_player_session_stats.sql +++ b/models/custom/snowplow_media_player_session_stats.sql @@ -14,7 +14,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 "field": "start_tstamp", "data_type": "timestamp" }, databricks_val='start_tstamp_date'), - cluster_by=snowplow_utils.get_value_by_target_type(bigquery_val=["domain_userid"]), + cluster_by=snowplow_utils.get_value_by_target_type(bigquery_val=["user_identifier"]), sql_header=snowplow_utils.set_query_tag(var('snowplow__query_tag', 'snowplow_dbt')) ) }} @@ -22,8 +22,9 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 with prep as ( select - session_identifier, - domain_userid, + -- get the first domain_sessionid in the array; NOTE(review): array subscripts are 1-based on some warehouses and 0-based on others - confirm [1] selects the first element on every supported target + cast(({{ snowplow_utils.get_split_to_array('domain_sessionid_array', 'b') }})[1] as {{ type_string() }}) as domain_sessionid, + user_identifier, count(*) as impressions, count(distinct case when 
media_type = 'video' and is_played then media_identifier end) as videos_played, count(distinct case when media_type = 'audio' and is_played then media_identifier end) as audio_played, @@ -40,7 +41,7 @@ with prep as ( coalesce(avg(case when is_played then coalesce(play_time_secs / nullif(duration_secs, 0), 0) end),0) as avg_percent_played, sum(case when is_complete_play then 1 else 0 end) as complete_plays - from {{ ref("snowplow_media_player_base") }} + from {{ ref("snowplow_media_player_base") }} as b group by 1,2 diff --git a/models/custom/snowplow_media_player_user_stats.sql b/models/custom/snowplow_media_player_user_stats.sql index eb94cec..5463cb6 100644 --- a/models/custom/snowplow_media_player_user_stats.sql +++ b/models/custom/snowplow_media_player_user_stats.sql @@ -9,12 +9,12 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 config( materialized = 'table', sort = 'first_play', - dist = 'domain_userid', + dist = 'user_identifier', partition_by = snowplow_utils.get_value_by_target_type(bigquery_val={ "field": "first_play", "data_type": "timestamp" }, databricks_val='first_play_date'), - cluster_by=snowplow_utils.get_value_by_target_type(bigquery_val=["domain_userid"]), + cluster_by=snowplow_utils.get_value_by_target_type(bigquery_val=["user_identifier"]), sql_header=snowplow_utils.set_query_tag(var('snowplow__query_tag', 'snowplow_dbt')) ) }} @@ -22,7 +22,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 with prep as ( select - domain_userid, + user_identifier, min(case when (video_plays + audio_plays) > 0 then start_tstamp end) as first_play, max(case when (video_plays + audio_plays) > 0 then start_tstamp end) as last_play, sum(video_plays) as video_plays, diff --git a/models/media_ad_views/scratch/base_scratch.yml b/models/media_ad_views/scratch/base_scratch.yml index 64679ad..0742397 100644 --- a/models/media_ad_views/scratch/base_scratch.yml +++ 
b/models/media_ad_views/scratch/base_scratch.yml @@ -27,6 +27,8 @@ models: description: '{{ doc("col_user_id")}}' - name: session_identifier description: '{{ doc("col_session_identifier")}}' + - name: domain_sessionid_array + description: '{{ doc("col_domain_sessionid_array") }}' - name: play_id description: '{{ doc("col_play_id")}}' - name: ad_break_id diff --git a/models/media_ad_views/scratch/snowplow_media_player_media_ad_views_this_run.sql b/models/media_ad_views/scratch/snowplow_media_player_media_ad_views_this_run.sql index c656ff3..52d4acb 100644 --- a/models/media_ad_views/scratch/snowplow_media_player_media_ad_views_this_run.sql +++ b/models/media_ad_views/scratch/snowplow_media_player_media_ad_views_this_run.sql @@ -33,48 +33,94 @@ events_this_run as ( , prep as ( select - {{ dbt_utils.generate_surrogate_key(['ev.platform', 'ev.media_identifier', 'ev.ad_id']) }} as media_ad_id, - - ev.platform, - ev.media_identifier, - max(ev.media_label) as media_label, - ev.user_identifier, - ev.session_identifier, - ev.user_id, - ev.play_id, - - {{ media_ad_break_field('ev.ad_break_id') }} as ad_break_id, - {{ media_ad_break_field('max(ev.ad_break_name)' ) }} as ad_break_name, - {{ media_ad_break_field('max(ev.ad_break_type)' ) }} as ad_break_type, - - {{ media_ad_field('ev.ad_id') }} as ad_id, - {{ media_ad_field('max(ev.ad_name)') }} as name, - {{ media_ad_field('max(ev.ad_creative_id)') }} as creative_id, - {{ media_ad_field('max(ev.ad_duration_secs)') }} as duration_secs, - {{ media_ad_field('avg(ev.ad_pod_position)') }} as pod_position, - {{ media_ad_field('sum(case when ev.ad_skippable then 1 else 0 end) > 0') }} as skippable, - - max(case when ev.event_type = 'adclick' then 1 else 0 end) > 0 as clicked, - max(case when ev.event_type = 'adskip' then 1 else 0 end) > 0 as skipped, - {{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 25) then 1 else 0 end) > 0") }} as 
percent_reached_25, - {{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 50) then 1 else 0 end) > 0") }} as percent_reached_50, - {{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 75) then 1 else 0 end) > 0") }} as percent_reached_75, - max(case when ev.event_type = 'adcomplete' then 1 else 0 end) > 0 as percent_reached_100, - - min(ev.start_tstamp) as viewed_at, - max(ev.start_tstamp) as last_event + {{ dbt_utils.generate_surrogate_key(['ev.platform', 'ev.media_identifier', 'ev.ad_id']) }} as media_ad_id + + ,ev.platform + ,ev.media_identifier + ,ev.user_identifier + ,ev.session_identifier + ,ev.user_id + ,ev.play_id + ,{{ media_ad_break_field('ev.ad_break_id') }} as ad_break_id + ,{{ media_ad_field('ev.ad_id') }} as ad_id + + {%- if var('snowplow__ad_views_passthroughs', []) -%} + {%- set passthrough_names = [] -%} + {%- for identifier in var('snowplow__ad_views_passthroughs', []) %} + {# Check if it is a simple column or a sql+alias #} + {%- if identifier is mapping -%} + ,{{identifier['sql']}} as {{identifier['alias']}} + {%- do passthrough_names.append(identifier['alias']) -%} + {%- else -%} + ,ev.{{identifier}} + {%- do passthrough_names.append(identifier) -%} + {%- endif -%} + {% endfor -%} + {%- endif %} + + ,max(ev.media_label) as media_label + ,{{ media_ad_break_field('max(ev.ad_break_name)' ) }} as ad_break_name + ,{{ media_ad_break_field('max(ev.ad_break_type)' ) }} as ad_break_type + + ,{{ media_ad_field('max(ev.ad_name)') }} as name + ,{{ media_ad_field('max(ev.ad_creative_id)') }} as creative_id + ,{{ media_ad_field('max(ev.ad_duration_secs)') }} as duration_secs + ,{{ media_ad_field('avg(ev.ad_pod_position)') }} as pod_position + ,{{ media_ad_field('sum(case when ev.ad_skippable then 1 else 0 end) > 0') }} as skippable + + ,max(case when ev.event_type = 'adclick' then 1 
else 0 end) > 0 as clicked + ,max(case when ev.event_type = 'adskip' then 1 else 0 end) > 0 as skipped + ,{{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 25) then 1 else 0 end) > 0") }} as percent_reached_25 + ,{{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 50) then 1 else 0 end) > 0") }} as percent_reached_50 + ,{{ media_ad_quartile_event_field("max(case when ev.event_type = 'adcomplete' or (ev.event_type = 'adquartile' and ev.ad_percent_progress >= 75) then 1 else 0 end) > 0") }} as percent_reached_75 + ,max(case when ev.event_type = 'adcomplete' then 1 else 0 end) > 0 as percent_reached_100 + + ,min(ev.start_tstamp) as viewed_at + ,max(ev.start_tstamp) as last_event + ,{{ snowplow_utils.get_string_agg('original_session_identifier', 'ev', is_distinct=True) }} as domain_sessionid_array from events_this_run as ev - group by 1, 2, 3, 5, 6, 7, 8, 9, 12 + {{ dbt_utils.group_by(n=9+(var('snowplow__ad_views_passthroughs', [])|length)) }} ) select {{ dbt_utils.generate_surrogate_key(['p.play_id', 'p.ad_break_id', 'p.media_ad_id']) }} as media_ad_view_id - , p.* + ,p.media_ad_id + ,p.platform + ,p.media_identifier + ,p.media_label + ,p.user_identifier + ,p.session_identifier + ,p.domain_sessionid_array + ,p.user_id + ,p.play_id + ,p.ad_break_id + ,p.ad_break_name + ,p.ad_break_type + ,p.ad_id + ,p.name + ,p.creative_id + ,p.duration_secs + ,p.pod_position + ,p.skippable + ,p.clicked + ,p.skipped + ,p.percent_reached_25 + ,p.percent_reached_50 + ,p.percent_reached_75 + ,p.percent_reached_100 + ,p.viewed_at + ,p.last_event {% if target.type in ['databricks', 'spark'] -%} , date(p.viewed_at) as viewed_at_date {%- endif %} + -- passthrough fields + {%- if var('snowplow__ad_views_passthroughs', []) -%} + {%- for col in passthrough_names %} + , p.{{col}} + {%- endfor -%} + {%- endif %} from prep 
as p diff --git a/models/media_base/scratch/base_scratch.yml b/models/media_base/scratch/base_scratch.yml index 688c00a..eb336b6 100644 --- a/models/media_base/scratch/base_scratch.yml +++ b/models/media_base/scratch/base_scratch.yml @@ -21,8 +21,12 @@ models: description: '{{ doc("col_media_label") }}' - name: session_identifier description: '{{ doc("col_session_identifier") }}' - - name: domain_userid + - name: domain_sessionid_array + description: '{{ doc("col_domain_sessionid_array") }}' + - name: user_identifier description: '{{ doc("col_domain_userid") }}' + - name: user_id + description: '{{ doc("col_user_id")}}' - name: page_referrer description: '{{ doc("col_page_referrer") }}' - name: page_url diff --git a/models/media_base/scratch/snowplow_media_player_base_this_run.sql b/models/media_base/scratch/snowplow_media_player_base_this_run.sql index 5864516..fd81175 100644 --- a/models/media_base/scratch/snowplow_media_player_base_this_run.sql +++ b/models/media_base/scratch/snowplow_media_player_base_this_run.sql @@ -33,54 +33,76 @@ events_this_run as ( , prep as ( select - i.play_id, - i.page_view_id, - i.media_identifier, - i.player_id, - i.media_label, - i.session_identifier, - i.user_identifier, - i.user_id, - i.platform, - max(i.duration_secs) as duration_secs, - i.media_type, - i.media_player_type, - i.page_referrer, - i.page_url, - max(i.source_url) as source_url, - i.geo_region_name, - i.br_name, - i.dvce_type, - i.os_name, - i.os_timezone, - min(start_tstamp) as start_tstamp, - max(start_tstamp) as end_tstamp, - sum(case when i.event_type = 'play' then 1 else 0 end) as plays, - sum(case when i.event_type in ('seek', 'seeked', 'seekend') then 1 else 0 end) as seeks, - sum(i.play_time_secs) as play_time_secs, - sum(i.play_time_muted_secs) as play_time_muted_secs, - coalesce( + i.play_id + ,i.page_view_id + ,i.media_identifier + ,i.player_id + ,i.media_label + ,i.session_identifier + ,i.user_identifier + ,i.user_id + ,i.platform + ,i.media_type + 
,i.media_player_type + ,i.page_referrer + ,i.page_url + ,i.geo_region_name + ,i.br_name + ,i.dvce_type + ,i.os_name + ,i.os_timezone + + {%- if var('snowplow__base_passthroughs', []) -%} + {%- set passthrough_names = [] -%} + {%- for identifier in var('snowplow__base_passthroughs', []) %} + {# Check if it is a simple column or a sql+alias #} + {%- if identifier is mapping -%} + ,{{identifier['sql']}} as {{identifier['alias']}} + {%- do passthrough_names.append(identifier['alias']) -%} + {%- else -%} + ,i.{{identifier}} + {%- do passthrough_names.append(identifier) -%} + {%- endif -%} + {% endfor -%} + {%- endif %} + + ,max(i.source_url) as source_url + ,max(i.duration_secs) as duration_secs + ,min(start_tstamp) as start_tstamp + ,max(start_tstamp) as end_tstamp + ,sum(case when i.event_type = 'play' then 1 else 0 end) as plays + ,sum(case when i.event_type in ('seek', 'seeked', 'seekend') then 1 else 0 end) as seeks + ,sum(i.play_time_secs) as play_time_secs + ,sum(i.play_time_muted_secs) as play_time_muted_secs + ,coalesce( sum(i.playback_rate * i.play_time_secs) / nullif(sum(i.play_time_secs), 0), max(i.playback_rate) - ) as avg_playback_rate, - min(case when i.event_type in ('seek', 'seeked', 'seekstart', 'seekend') then start_tstamp end) as first_seek_time, - max(i.percent_progress) as max_percent_progress + ) as avg_playback_rate + ,min(case when i.event_type in ('seek', 'seeked', 'seekstart', 'seekend') then start_tstamp end) as first_seek_time + ,max(i.percent_progress) as max_percent_progress + ,{{ snowplow_utils.get_string_agg('original_session_identifier', 'i', is_distinct=True) }} as domain_sessionid_array + from events_this_run as i - group by 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20 + {{ dbt_utils.group_by(n=18+(var('snowplow__base_passthroughs', [])|length)) }} ) , dedupe as ( select - *, - row_number() - over (partition by play_id order by start_tstamp) as duplicate_count + * + {% if target.type == 'postgres' %} + ,row_number() 
over (partition by play_id order by start_tstamp) as duplicate_count + {% endif %} from prep + {% if target.type not in ['postgres'] %} + qualify row_number() over (partition by play_id order by start_tstamp) = 1 + {% endif %} + ) , media_sessions as ( @@ -161,6 +183,7 @@ select d.player_id, d.media_label, d.session_identifier, + d.domain_sessionid_array, d.user_identifier, d.user_id, d.page_referrer, @@ -233,6 +256,13 @@ select , date(d.start_tstamp) as start_tstamp_date {%- endif %} + -- passthrough fields + {%- if var('snowplow__base_passthroughs', []) -%} + {%- for col in passthrough_names %} + , d.{{col}} + {%- endfor -%} + {%- endif %} + from dedupe as d left join retention_rate as r @@ -247,4 +277,6 @@ left join media_sessions as s left join percent_progress_by_play_id as p on p.play_id = d.play_id -where d.duplicate_count = 1 +{% if target.type == 'postgres' %} + where d.duplicate_count = 1 +{% endif %}