From 260fe030fbe296d94be68a5465188100cdd69bee Mon Sep 17 00:00:00 2001
From: popcorny <celu@infuseai.io>
Date: Mon, 5 Aug 2024 17:37:09 +0800
Subject: [PATCH] Add macros for athena

Signed-off-by: popcorny <celu@infuseai.io>
---
 macros/README.md        |  8 +++++
 macros/recce_athena.sql | 73 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 macros/README.md
 create mode 100644 macros/recce_athena.sql
diff --git a/macros/README.md b/macros/README.md
new file mode 100644
index 00000000..0a480721
--- /dev/null
+++ b/macros/README.md
@@ -0,0 +1,8 @@
+## Adapter Macros
+
+Recce uses the dbt packages `audit_helper` and `dbt_profiler` for value diff and profile diff. However, these packages run into SQL compatibility issues on some warehouses. This folder provides adapter-specific macros by means of the dbt [macro dispatch](https://docs.getdbt.com/reference/dbt-jinja-functions/dispatch) mechanism.
+
+## How to use
+
+1. Copy `recce_<adapter>.sql` to your dbt project `macros/` folder.
+2. Rerun the `dbt` command to add the macros to your `target/manifest.json`
diff --git a/macros/recce_athena.sql b/macros/recce_athena.sql
new file mode 100644
index 00000000..bee15ffa
--- /dev/null
+++ b/macros/recce_athena.sql
@@ -0,0 +1,73 @@
+{% macro athena__compare_column_values(a_query, b_query, primary_key, column_to_compare, emojis, a_relation_name, b_relation_name) -%}
+{#- Athena-prefixed compare_column_values (picked up through dbt adapter dispatch): full-outer-joins
+    a_query and b_query on primary_key and returns one row per match category of column_to_compare,
+    with its record count and percentage of all rows. When emojis is truthy each label gets an emoji
+    prefix; a_relation_name / b_relation_name are only used inside the label text. -#}
+with a_query as (
+    {{ a_query }}
+),
+
+b_query as (
+    {{ b_query }}
+),
+{# joined: pairs rows from both sides by primary key (the full outer join keeps unmatched rows) and tags each pair. #}
+joined as (
+    select
+        coalesce(a_query.{{ primary_key }}, b_query.{{ primary_key }}) as {{ primary_key }},
+        a_query.{{ column_to_compare }} as a_query_value,
+        b_query.{{ column_to_compare }} as b_query_value,
+        {# WHEN order matters: equal, both null, row missing on one side (null key), one-sided null, different. The match_order ladder below repeats the same conditions as a sort key. #}
+        case
+            when a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }} then '{% if emojis %}✅: {% endif %}perfect match'
+            when a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null then '{% if emojis %}✅: {% endif %}both are null'
+            when a_query.{{ primary_key }} is null then '{% if emojis %}🤷: {% endif %}missing from {{ a_relation_name }}'
+            when b_query.{{ primary_key }} is null then '{% if emojis %}🤷: {% endif %}missing from {{ b_relation_name }}'
+            when a_query.{{ column_to_compare }} is null then '{% if emojis %}🤷: {% endif %}value is null in {{ a_relation_name }} only'
+            when b_query.{{ column_to_compare }} is null then '{% if emojis %}🤷: {% endif %}value is null in {{ b_relation_name }} only'
+            when a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} then '{% if emojis %}❌: {% endif %}‍values do not match'
+            else 'unknown' -- this should never happen
+        end as match_status,
+        case
+            when a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }} then 0
+            when a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null then 1
+            when a_query.{{ primary_key }} is null then 2
+            when b_query.{{ primary_key }} is null then 3
+            when a_query.{{ column_to_compare }} is null then 4
+            when b_query.{{ column_to_compare }} is null then 5
+            when a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} then 6
+            else 7 -- this should never happen
+        end as match_order
+    from a_query
+    full outer join b_query on a_query.{{ primary_key }} = b_query.{{ primary_key }}
+),
+
+aggregated as (
+    select
+        '{{ column_to_compare }}' as column_name,
+        match_status,
+        match_order,
+        count(*) as count_records
+    from joined
+    {# Groups by the string literal rather than the column_name alias - presumably because Athena does not accept select-list aliases in GROUP BY; TODO confirm. #}
+    group by '{{ column_to_compare }}', match_status, match_order
+)
+{# percent_of_total divides each category count by the window sum over all categories; the 100.0 literal avoids integer division. match_order is used for sorting only and is not returned. #}
+select
+    column_name,
+    match_status,
+    count_records,
+    round(100.0 * count_records / sum(count_records) over (), 2) as percent_of_total
+from aggregated
+order by match_order
+{% endmacro %}
+
+
+{%- macro athena__measure_median(column_name, data_type, cte_name) -%}
+{#- Median measure for Athena: approx_percentile at 0.5 for numeric, non-struct data types; a NULL cast to the numeric type otherwise. cte_name is accepted but not referenced here. -#}
+{%- set skip_median = not dbt_profiler.is_numeric_dtype(data_type) or dbt_profiler.is_struct_dtype(data_type) -%}
+{%- if skip_median -%}
+    cast(null as {{ dbt.type_numeric() }})
+{%- else -%}
+    approx_percentile( {{ adapter.quote(column_name) }}, 0.5)
+{%- endif -%}
+{%- endmacro -%}
\ No newline at end of file