From 23da1f424398696e25656dc6f89066b9a5d13794 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 5 Mar 2024 23:55:08 +0000 Subject: [PATCH] add precision + exclude_columns option to equality test (#765) * add exclude columns to equality test * add precision option to equality test * CI fix? * CI fix 2.0 * Update CHANGELOG.md * Check for subset of columns (Close #785) * cast type * cast type across warehouses * swap to copiler error, account for ignore columns * Update CL * allow for different cased names * fix CL * linting * Rename to exclude_columns * Fix typo * Add package-lock.yaml to .gitignore * Update comments --------- Co-authored-by: bruno Co-authored-by: Joel Labes Co-authored-by: gwen windflower --- CHANGELOG.md | 10 +- README.md | 18 ++- integration_tests/.gitignore | 2 +- .../schema_tests/data_test_equality_a.csv | 4 + .../schema_tests/data_test_equality_b.csv | 4 + .../data_test_equality_floats_a.csv | 11 ++ .../data_test_equality_floats_b.csv | 11 ++ .../data_test_equality_floats_columns_a.csv | 11 ++ .../data_test_equality_floats_columns_b.csv | 11 ++ integration_tests/dbt_project.yml | 18 ++- .../generic_tests/equality_less_columns.sql | 9 ++ .../models/generic_tests/schema.yml | 58 +++++++++ integration_tests/models/sql/test_union.sql | 4 +- macros/generic_tests/equality.sql | 113 ++++++++++++++++-- 14 files changed, 266 insertions(+), 18 deletions(-) create mode 100644 integration_tests/data/schema_tests/data_test_equality_a.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_b.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_floats_a.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_floats_b.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_floats_columns_a.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_floats_columns_b.csv create mode 100644 integration_tests/models/generic_tests/equality_less_columns.sql 
diff --git a/CHANGELOG.md b/CHANGELOG.md index a72abb7d..132e8a1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,13 @@ ---> # Unreleased +## New features +- The `equality` test now accepts an additional argument, `precision`, to aid in comparing floating point numbers ([#757](https://github.com/dbt-labs/dbt-utils/issues/757), [#765](https://github.com/dbt-labs/dbt-utils/pull/765)) +- Add option to ignore columns in equality test ([#734](https://github.com/dbt-labs/dbt-utils/issues/734), [#737](https://github.com/dbt-labs/dbt-utils/pull/737)) ## Fixes - deduplicate macro for Databricks now uses the QUALIFY clause, which fixes NULL columns issues from the default natural join logic - deduplicate macro for Redshift now uses the QUALIFY clause, which fixes NULL columns issues from the default natural join logic +- Equality test will now raise an error when the second model has fewer columns than the first ([#785](https://github.com/dbt-labs/dbt-utils/issues/785)) - get_tables_by_pattern_sql will now: - return redshift external tables ([#752](https://github.com/dbt-labs/dbt-utils/issues/752) - work with valid redshift database names that contain dashes @@ -19,8 +23,10 @@ - created a new dispatch redshift__get_tables_by_pattern which unions the result of the default macro and querying svv_external_tables for the same conditions (schema name, pattern, exclude pattern). 
## Contributors: -[@graciegoheen](https://github.com/graciegoheen) -[@yauhen-sobaleu](https://github.com/yauhen-sobaleu) +- [@graciegoheen](https://github.com/graciegoheen) +- [@yauhen-sobaleu](https://github.com/yauhen-sobaleu) +- [@rlh1994](https://github.com/rlh1994) +- [@brunocostalopes](https://github.com/brunocostalopes) [@brendan-cook-87](https://github.com/brendan-cook-87) # dbt utils v1.1.1 diff --git a/README.md b/README.md index 827626ba..55c926c3 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ This test supports the `group_by_columns` parameter; see [Grouping in tests](#gr ### equality ([source](macros/generic_tests/equality.sql)) -Asserts the equality of two relations. Optionally specify a subset of columns to compare. +Asserts the equality of two relations. Optionally specify a subset of columns to compare or exclude, and a precision to compare numeric columns on. **Usage:** @@ -122,13 +122,29 @@ Asserts the equality of two relations. Optionally specify a subset of columns to version: 2 models: + # compare the entire table - name: model_name + tests: + - dbt_utils.equality: + compare_model: ref('other_table_name') + + # only compare some of the columns + - name: model_name_compare_columns tests: - dbt_utils.equality: compare_model: ref('other_table_name') compare_columns: - first_column - second_column + precision: 4 + + # compare all columns except the ones on the ignore list + - name: model_name_exclude_columns + tests: + - dbt_utils.equality: + compare_model: ref('other_table_name') + exclude_columns: + - third_column ``` ### expression_is_true ([source](macros/generic_tests/expression_is_true.sql)) diff --git a/integration_tests/.gitignore b/integration_tests/.gitignore index 638c8c44..b4500d79 100644 --- a/integration_tests/.gitignore +++ b/integration_tests/.gitignore @@ -1,6 +1,6 @@ - target/ dbt_modules/ logs/ .env/ profiles.yml +package-lock.yml diff --git a/integration_tests/data/schema_tests/data_test_equality_a.csv 
b/integration_tests/data/schema_tests/data_test_equality_a.csv new file mode 100644 index 00000000..35fa4ee0 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_a.csv @@ -0,0 +1,4 @@ +col_a,col_b,col_c +1,1,3 +1,2,1 +2,3,3 diff --git a/integration_tests/data/schema_tests/data_test_equality_b.csv b/integration_tests/data/schema_tests/data_test_equality_b.csv new file mode 100644 index 00000000..c9fda320 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_b.csv @@ -0,0 +1,4 @@ +col_a,col_b,col_c +1,1,2 +1,2,2 +2,3,2 diff --git a/integration_tests/data/schema_tests/data_test_equality_floats_a.csv b/integration_tests/data/schema_tests/data_test_equality_floats_a.csv new file mode 100644 index 00000000..85241961 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_floats_a.csv @@ -0,0 +1,11 @@ +id,float_number +05ac09c4-f947-45a8-8c14-88f430f8b294,62.3888186 +cfae9054-940b-42a1-84d4-052daae6194f,81.2511656 +6029501d-c274-49f2-a69d-4c75a3d9931d,23.3959675 +c653e520-df81-4a5f-b44b-bb1b4c1b7846,72.2100841 +59caed0d-53d6-473c-a88c-3726c7693f05,68.6029434 +b441f6a0-ce7f-4ad9-b96b-b41d73a94ae7,72.7861425 +26491840-bfd4-4496-9ca9-ad9220a2de47,35.3662223 +b4f233ce-a494-4bb6-9cf2-73bb6854e58a,89.1524680 +11c979b7-2661-4375-8143-7c9b54b90627,19.5755431 +a8057f73-312e-48e6-b344-f4a510a2c4a8,22.9237047 diff --git a/integration_tests/data/schema_tests/data_test_equality_floats_b.csv b/integration_tests/data/schema_tests/data_test_equality_floats_b.csv new file mode 100644 index 00000000..0306a9aa --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_floats_b.csv @@ -0,0 +1,11 @@ +id,float_number +05ac09c4-f947-45a8-8c14-88f430f8b294,62.3888187 +cfae9054-940b-42a1-84d4-052daae6194f,81.2511657 +6029501d-c274-49f2-a69d-4c75a3d9931d,23.3959676 +c653e520-df81-4a5f-b44b-bb1b4c1b7846,72.2100842 +59caed0d-53d6-473c-a88c-3726c7693f05,68.6029435 +b441f6a0-ce7f-4ad9-b96b-b41d73a94ae7,72.7861426 
+26491840-bfd4-4496-9ca9-ad9220a2de47,35.3662224 +b4f233ce-a494-4bb6-9cf2-73bb6854e58a,89.1524681 +11c979b7-2661-4375-8143-7c9b54b90627,19.5755432 +a8057f73-312e-48e6-b344-f4a510a2c4a8,22.9237048 diff --git a/integration_tests/data/schema_tests/data_test_equality_floats_columns_a.csv b/integration_tests/data/schema_tests/data_test_equality_floats_columns_a.csv new file mode 100644 index 00000000..77beeae9 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_floats_columns_a.csv @@ -0,0 +1,11 @@ +id,float_number,to_ignore +05ac09c4-f947-45a8-8c14-88f430f8b294,62.3888186,a +cfae9054-940b-42a1-84d4-052daae6194f,81.2511656,a +6029501d-c274-49f2-a69d-4c75a3d9931d,23.3959675,a +c653e520-df81-4a5f-b44b-bb1b4c1b7846,72.2100841,a +59caed0d-53d6-473c-a88c-3726c7693f05,68.6029434,a +b441f6a0-ce7f-4ad9-b96b-b41d73a94ae7,72.7861425,a +26491840-bfd4-4496-9ca9-ad9220a2de47,35.3662223,a +b4f233ce-a494-4bb6-9cf2-73bb6854e58a,89.1524680,a +11c979b7-2661-4375-8143-7c9b54b90627,19.5755431,a +a8057f73-312e-48e6-b344-f4a510a2c4a8,22.9237047,a diff --git a/integration_tests/data/schema_tests/data_test_equality_floats_columns_b.csv b/integration_tests/data/schema_tests/data_test_equality_floats_columns_b.csv new file mode 100644 index 00000000..e26305de --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_floats_columns_b.csv @@ -0,0 +1,11 @@ +id,float_number,to_ignore +05ac09c4-f947-45a8-8c14-88f430f8b294,62.3888186,b +cfae9054-940b-42a1-84d4-052daae6194f,81.2511656,b +6029501d-c274-49f2-a69d-4c75a3d9931d,23.3959675,b +c653e520-df81-4a5f-b44b-bb1b4c1b7846,72.2100841,b +59caed0d-53d6-473c-a88c-3726c7693f05,68.6029434,b +b441f6a0-ce7f-4ad9-b96b-b41d73a94ae7,72.7861425,b +26491840-bfd4-4496-9ca9-ad9220a2de47,35.3662223,b +b4f233ce-a494-4bb6-9cf2-73bb6854e58a,89.1524680,b +11c979b7-2661-4375-8143-7c9b54b90627,19.5755431,b +a8057f73-312e-48e6-b344-f4a510a2c4a8,22.9237047,b diff --git a/integration_tests/dbt_project.yml 
b/integration_tests/dbt_project.yml index a9531e78..830643ec 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -32,7 +32,7 @@ seeds: sql: data_events_20180103: +schema: events - + data_get_column_values_dropped: # this.incorporate() to hardcode the node's type as otherwise dbt doesn't know it yet +post-hook: "{% do adapter.drop_relation(this.incorporate(type='table')) %}" @@ -53,3 +53,19 @@ seeds: data_test_sequential_timestamps: +column_types: my_timestamp: timestamp + + data_test_equality_floats_a: + +column_types: + float_number: float + + data_test_equality_floats_columns_a: + +column_types: + float_number: float + + data_test_equality_floats_b: + +column_types: + float_number: float + + data_test_equality_floats_columns_b: + +column_types: + float_number: float diff --git a/integration_tests/models/generic_tests/equality_less_columns.sql b/integration_tests/models/generic_tests/equality_less_columns.sql new file mode 100644 index 00000000..415bf949 --- /dev/null +++ b/integration_tests/models/generic_tests/equality_less_columns.sql @@ -0,0 +1,9 @@ +with data as ( + + select * from {{ ref('data_test_equality_b') }} + +) + +select + col_a, col_b +from data diff --git a/integration_tests/models/generic_tests/schema.yml b/integration_tests/models/generic_tests/schema.yml index fa0e7441..022b0a59 100644 --- a/integration_tests/models/generic_tests/schema.yml +++ b/integration_tests/models/generic_tests/schema.yml @@ -142,6 +142,57 @@ seeds: - dbt_utils.not_null_proportion: at_least: 0.9 + - name: data_test_equality_a + tests: + - dbt_utils.equality: + compare_model: ref('data_test_equality_a') + - dbt_utils.equality: + compare_model: ref('data_test_equality_b') + error_if: "<1" #sneaky way to ensure that the test is returning failing rows + warn_if: "<0" + - dbt_utils.equality: + compare_model: ref('data_test_equality_b') + compare_columns: + - col_a + - col_b + - dbt_utils.equality: + compare_model: 
ref('data_test_equality_b') + exclude_columns: + - col_c + + - name: data_test_equality_floats_a + tests: + # test precision only + - dbt_utils.equality: + compare_model: ref('data_test_equality_floats_b') + precision: 4 + - dbt_utils.equality: + compare_model: ref('data_test_equality_floats_b') + precision: 8 + error_if: "<1" #sneaky way to ensure that the test is returning failing rows + warn_if: "<0" + + - name: data_test_equality_floats_columns_a + tests: + # Positive assertion tests + - dbt_utils.equality: + compare_model: ref('data_test_equality_floats_columns_b') + compare_columns: + - id + - float_number + precision: 4 + - dbt_utils.equality: + compare_model: ref('data_test_equality_floats_columns_b') + exclude_columns: + - to_ignore + precision: 4 + # all columns should fail even with rounding + - dbt_utils.equality: + compare_model: ref('data_test_equality_floats_columns_b') + precision: 4 + error_if: "<1" #sneaky way to ensure that the test is returning failing rows + warn_if: "<0" + models: - name: recency_time_included tests: @@ -199,3 +250,10 @@ models: - dbt_utils.fewer_rows_than: compare_model: ref('data_test_fewer_rows_than_table_2') group_by_columns: ['col_a'] + + - name: equality_less_columns + tests: + - dbt_utils.equality: + compare_model: ref('data_test_equality_a') + exclude_columns: + - col_c diff --git a/integration_tests/models/sql/test_union.sql b/integration_tests/models/sql/test_union.sql index 69836833..8d675ede 100644 --- a/integration_tests/models/sql/test_union.sql +++ b/integration_tests/models/sql/test_union.sql @@ -2,7 +2,7 @@ select id, name, - favorite_color + favorite_color, + favorite_number from {{ ref('test_union_base') }} - diff --git a/macros/generic_tests/equality.sql b/macros/generic_tests/equality.sql index ffc6a2b8..d7d7197c 100644 --- a/macros/generic_tests/equality.sql +++ b/macros/generic_tests/equality.sql @@ -1,8 +1,12 @@ -{% test equality(model, compare_model, compare_columns=None) %} - {{ 
return(adapter.dispatch('test_equality', 'dbt_utils')(model, compare_model, compare_columns)) }} +{% test equality(model, compare_model, compare_columns=None, exclude_columns=None, precision = None) %} + {{ return(adapter.dispatch('test_equality', 'dbt_utils')(model, compare_model, compare_columns, exclude_columns, precision)) }} {% endtest %} -{% macro default__test_equality(model, compare_model, compare_columns=None) %} +{% macro default__test_equality(model, compare_model, compare_columns=None, exclude_columns=None, precision = None) %} + +{%- if compare_columns and exclude_columns -%} + {{ exceptions.raise_compiler_error("Both a compare and an ignore list were provided to the `equality` macro. Only one is allowed") }} +{%- endif -%} {% set set_diff %} count(*) + coalesce(abs( @@ -19,20 +23,107 @@ {{ return('') }} {% endif %} + + -- setup {%- do dbt_utils._is_relation(model, 'test_equality') -%} -{#- -If the compare_cols arg is provided, we can run this test without querying the -information schema — this allows the model to be an ephemeral model --#} - +{# Ensure there are no extra columns in the compare_model vs model #} {%- if not compare_columns -%} {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} - {%- set compare_columns = adapter.get_columns_in_relation(model) | map(attribute='quoted') -%} -{%- endif -%} + {%- do dbt_utils._is_ephemeral(compare_model, 'test_equality') -%} + + {%- set model_columns = adapter.get_columns_in_relation(model) -%} + {%- set compare_model_columns = adapter.get_columns_in_relation(compare_model) -%} + + + {%- if exclude_columns -%} + {#-- Lower case ignore columns for easier comparison --#} + {%- set exclude_columns = exclude_columns | map("lower") | list %} + + {# Filter out the excluded columns #} + {%- set include_columns = [] %} + {%- set include_model_columns = [] %} + {%- for column in model_columns -%} + {%- if column.name | lower not in exclude_columns -%} + {% do include_columns.append(column) %} + {%- endif %} 
+ {%- endfor %} + {%- for column in compare_model_columns -%} + {%- if column.name | lower not in exclude_columns -%} + {% do include_model_columns.append(column) %} + {%- endif %} + {%- endfor %} + + {%- set compare_columns_set = set(include_columns | map(attribute='quoted') | map("lower")) %} + {%- set compare_model_columns_set = set(include_model_columns | map(attribute='quoted') | map("lower")) %} + {%- else -%} + {%- set compare_columns_set = set(model_columns | map(attribute='quoted') | map("lower")) %} + {%- set compare_model_columns_set = set(compare_model_columns | map(attribute='quoted') | map("lower")) %} + {%- endif -%} + + {% if compare_columns_set != compare_model_columns_set %} + {{ exceptions.raise_compiler_error(compare_model ~" has less columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }} + {% endif %} -{% set compare_cols_csv = compare_columns | join(', ') %} + +{% endif %} + +{%- if not precision -%} + {%- if not compare_columns -%} + {# + You cannot get the columns in an ephemeral model (due to not existing in the information schema), + so if the user does not provide an explicit list of columns we must error in the case it is ephemeral + #} + {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} + {%- set compare_columns = adapter.get_columns_in_relation(model)-%} + + {%- if exclude_columns -%} + {#-- Lower case ignore columns for easier comparison --#} + {%- set exclude_columns = exclude_columns | map("lower") | list %} + + {# Filter out the excluded columns #} + {%- set include_columns = [] %} + {%- for column in compare_columns -%} + {%- if column.name | lower not in exclude_columns -%} + {% do include_columns.append(column) %} + {%- endif %} + {%- endfor %} + + {%- set compare_columns = include_columns | map(attribute='quoted') %} + {%- else -%} {# Compare columns provided #} + {%- set compare_columns = compare_columns | 
map(attribute='quoted') %} + {%- endif -%} + {%- endif -%} + + {% set compare_cols_csv = compare_columns | join(', ') %} + +{% else %} {# Precision required #} + {#- + If rounding is required, we need to get the types, so it cannot be ephemeral even if they provide column names + -#} + {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} + {%- set columns = adapter.get_columns_in_relation(model) -%} + + {% set columns_list = [] %} + {%- for col in columns -%} + {%- if ( + (col.name|lower in compare_columns|map('lower') or not compare_columns) and + (col.name|lower not in exclude_columns|map('lower') or not exclude_columns) + ) -%} + {# Databricks double type is not picked up by any number type checks in dbt #} + {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%} + {# Cast is required due to postgres not having round for a double precision number #} + {%- do columns_list.append('round(cast(' ~ col.quoted ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ col.quoted) -%} + {%- else -%} {# Non-numeric type #} + {%- do columns_list.append(col.quoted) -%} + {%- endif -%} + {% endif %} + {%- endfor -%} + + {% set compare_cols_csv = columns_list | join(', ') %} + +{% endif %} with a as (