From d568da92b46234f13d2895cf72407d1b6af065bd Mon Sep 17 00:00:00 2001
From: popcorny
Date: Mon, 5 Aug 2024 15:47:51 +0800
Subject: [PATCH] Fix the histogram diff for athena

Signed-off-by: popcorny
---
Reviewer notes (this area, between the "---" marker and the diffstat, is
not part of the commit message):
  * Both hunks in recce/tasks/histogram.py remove the trailing comma after
    `{max_value} as max_value`, the last item of the value_ranges SELECT
    list.
  * Presumably Athena rejects a trailing comma in that position (inferred
    from the subject line only) -- confirm against the Athena SQL reference.
  * tests/tasks/test_histogram.py is a new file. It runs HistogramDiffTask
    on the `age` column of a model created with identical base and current
    data, and asserts only the `current` counts/total plus the top-level
    min, max and bin_edges; the `base` side is not asserted.
  * Line breaks and indentation were restored from a whitespace-collapsed
    copy of this patch; verify context-line indentation against the target
    file before applying.

 recce/tasks/histogram.py      |  4 ++--
 tests/tasks/test_histogram.py | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 tests/tasks/test_histogram.py

diff --git a/recce/tasks/histogram.py b/recce/tasks/histogram.py
index 19872ab7..19973a7a 100644
--- a/recce/tasks/histogram.py
+++ b/recce/tasks/histogram.py
@@ -49,7 +49,7 @@ def generate_histogram_sql_integer(node, column, min_value, max_value, num_bins=
     WITH value_ranges AS (
         SELECT
             {min_value} as min_value,
-            {max_value} as max_value,
+            {max_value} as max_value
     ),
     bin_parameters AS (
         SELECT
@@ -85,7 +85,7 @@ def generate_histogram_sql_numeric(node, column, min_value, max_value, num_bins=
     WITH value_ranges AS (
         SELECT
             {min_value} as min_value,
-            {max_value} as max_value,
+            {max_value} as max_value
     ),
     bin_parameters AS (
         SELECT
diff --git a/tests/tasks/test_histogram.py b/tests/tasks/test_histogram.py
new file mode 100644
index 00000000..b88fd326
--- /dev/null
+++ b/tests/tasks/test_histogram.py
@@ -0,0 +1,37 @@
+from recce.tasks.histogram import HistogramDiffTask
+
+
+def test_histogram(dbt_test_helper):
+    csv_data = """
+        customer_id,name,age
+        1,Alice,30
+        2,Bob,25
+        3,Charlie,35
+        4,Dolly,50
+        """
+
+    dbt_test_helper.create_model("customers", csv_data, csv_data)
+
+    params = {
+        "model": "customers",
+        "column_name": "age",
+        "column_type": "int"
+    }
+
+    task = HistogramDiffTask(params)
+    run_result = task.execute()
+
+    # {
+    #     'base': {'counts': [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'total': 4},
+    #     'current': {'counts': [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'total': 4},
+    #     'min': 25, 'max': 50,
+    #     'bin_edges': [25, 26, ..., 51],
+    #     'labels': ['25-26', ..., '51-52']
+    # }
+    assert run_result['current']['counts'][0] == 1
+    assert run_result['current']['counts'][-1] == 1
+    assert run_result['current']['total'] == 4
+    assert run_result['min'] == 25
+    assert run_result['max'] == 50
+    assert run_result['bin_edges'][0] == 25
+    assert run_result['bin_edges'][-1] == 51