# 1064 profile array elements #1397
Changes from all commits: 1d9e6cc, cdea58a, cfdfd04, 95f6f9d, 09ef49f, 453883b, ad0bc26, 2f55b55, 53ebbeb, d6143f7, 6e03a11, 92fcb7d, d373e66, a6bdfdd, 7dd32cc, 677e81d, 641bb45, 15fb40c, 635c49a, d2366f9, 4b3831f, b219e7b, bc7b06e, 1ba5ee1, a1830b3, c04d98a
```diff
@@ -137,7 +137,9 @@ def _get_df_top_bottom_n(expressions, limit=20, value_order="desc"):
     return sql


-def _col_or_expr_frequencies_raw_data_sql(cols_or_exprs, table_name):
+def _col_or_expr_frequencies_raw_data_sql(
+    cols_or_exprs, array_cols, table_name, cast_arrays_as_str
+):
     cols_or_exprs = ensure_is_list(cols_or_exprs)
     column_expressions = expressions_to_sql(cols_or_exprs)
     sqls = []
```
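For orientation, a hedged sketch of how the refactored helper might now be invoked. This is an internal Splink function, not a public API; the column, table, and array names below are invented, and only the parameter list is taken from the diff:

```python
# Illustrative only: argument values are hypothetical.
sql = _col_or_expr_frequencies_raw_data_sql(
    cols_or_exprs=["first_name", "email"],     # expressions to profile
    array_cols=["postcode_arr"],               # columns detected as array-typed
    table_name="__splink__df_concat_abc123",   # physical name of the concat table
    cast_arrays_as_str=False,                  # False -> unnest arrays element-wise
)
```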
```diff
@@ -146,8 +148,12 @@ def _col_or_expr_frequencies_raw_data_sql(cols_or_exprs, table_name):

         # If the supplied column string is a list of columns to be concatenated,
         # add a quick clause to filter out any instances whereby either column contains
-        # a null value.
+        # a null value. Also raise an error if the user tries to supply array columns

         if isinstance(raw_expr, list):
+            if any([expr in array_cols for expr in raw_expr]):
+                raise ValueError("Arrays cannot be concatenated during profiling")
+
             null_exprs = [f"{c} is null" for c in raw_expr]
             null_exprs = " OR ".join(null_exprs)
```
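The new guard can be exercised in isolation. A minimal sketch with made-up column names, which raises by design:

```python
# Reproduces the concatenation guard above with hypothetical inputs.
raw_expr = ["first_name", "postcode_arr"]  # user asked to concatenate these
array_cols = ["postcode_arr"]              # but one of them is array-typed

if isinstance(raw_expr, list):
    if any([expr in array_cols for expr in raw_expr]):
        raise ValueError("Arrays cannot be concatenated during profiling")
```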
```diff
@@ -159,21 +165,52 @@ def _col_or_expr_frequencies_raw_data_sql(cols_or_exprs, table_name):
                 end
                 """

-        sql = f"""
-        select * from
-        (select
-            count(*) as value_count,
-            '{gn}' as group_name,
-            cast({col_or_expr} as varchar) as value,
-            (select count({col_or_expr}) from {table_name}) as total_non_null_rows,
-            (select count(*) from {table_name}) as total_rows_inc_nulls,
-            (select count(distinct {col_or_expr}) from {table_name})
-                as distinct_value_count
-        from {table_name}
-        where {col_or_expr} is not null
-        group by {col_or_expr}
-        order by count(*) desc) column_stats
-        """
+        if not cast_arrays_as_str and raw_expr in array_cols:
+
+            sql = f"""
+            select * from
+            (select value,
+                count(*) as value_count,
+                '{gn}' as group_name,
+
+                (select count(value) from
+                (select unnest({col_or_expr}) as value from {table_name}))
+                    as total_non_null_rows,
+
+                (select count(*) from
+                (select unnest({col_or_expr}) as value from {table_name}))
+                    as total_rows_inc_nulls,
+
+                (select count(distinct value) from
+                (select unnest({col_or_expr}) as value from {table_name}))
+                    as distinct_value_count
+
+            from
+            (select cast(unnest({col_or_expr}) as varchar) as value
+            from {table_name})
+            group by value
+            order by count(*) desc) column_stats
+            """
+
+        else:
+            sql = f"""
+            select * from
+            (select
+                cast({col_or_expr} as varchar) as value,
+                count(*) as value_count,
+                '{gn}' as group_name,
+                (select count({col_or_expr}) from {table_name}) as total_non_null_rows,
+                (select count(*) from {table_name}) as total_rows_inc_nulls,
+                (select count(distinct {col_or_expr}) from {table_name})
+                    as distinct_value_count
+            from {table_name}
+            where {col_or_expr} is not null
+            group by {col_or_expr}
+            order by count(*) desc) column_stats
+            """

         sqls.append(sql)

     return " union all ".join(sqls)
```

A review comment was left on the `unnest` call in this hunk:

> Is there a potential issue with `unnest` here, in that the function name changes depending on the dialect? E.g. in Spark it's `explode`. If so, it might be worth adding something to each backend, similar to how we deal with random samples (where the dialect varies between linkers). Apologies if you're already aware.

The author replied:

> Thanks for the heads up... yes, this was on my radar and was more complicated to deal with, as the 'translator' didn't translate the desired function, so Tom is going to work on applying this to the other backends. But I will ask him about the solution used with random samples, as it's good to know.
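One way to address the dialect point raised above, sketched outside Splink's actual backend machinery (the function name and dialect registry here are illustrative, not Splink's API):

```python
def array_explode_sql(col: str, table: str, dialect: str) -> str:
    """Build a subquery flattening an array column into rows of `value`.

    DuckDB and Postgres use unnest; Spark uses explode. Registering the
    fragment per backend keeps dialect knowledge out of the shared SQL.
    """
    explode_fns = {"duckdb": "unnest", "postgres": "unnest", "spark": "explode"}
    if dialect not in explode_fns:
        raise ValueError(f"No array-flattening function registered for {dialect!r}")
    return f"(select {explode_fns[dialect]}({col}) as value from {table})"

print(array_explode_sql("postcode_arr", "df", "duckdb"))
print(array_explode_sql("postcode_arr", "df", "spark"))
```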
```diff
@@ -190,18 +227,23 @@ def _add_100_percentile_to_df_percentiles(percentile_rows):
     return percentile_rows


-def profile_columns(linker, column_expressions, top_n=10, bottom_n=10):
-    df_concat = linker._initialise_df_concat()
+def profile_columns(
+    linker, column_expressions, top_n=10, bottom_n=10, cast_arrays_as_str=False
+):
+
+    df_concat = linker._initialise_df_concat(materialise=True)
+
     input_dataframes = []
     if df_concat:
         input_dataframes.append(df_concat)

+    array_cols = df_concat.get_array_cols()
+
     column_expressions_raw = ensure_is_list(column_expressions)
     column_expressions = expressions_to_sql(column_expressions_raw)

     sql = _col_or_expr_frequencies_raw_data_sql(
-        column_expressions_raw, "__splink__df_concat"
+        column_expressions_raw, array_cols, df_concat.physical_name, cast_arrays_as_str
     )

     linker._enqueue_sql(sql, "__splink__df_all_column_value_frequencies")
```
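With the wiring above, array columns are profiled element-wise when `cast_arrays_as_str=False`. A self-contained DuckDB snippet (made-up table and data, not Splink's exact generated SQL) shows the `unnest` counting pattern the array branch relies on:

```python
import duckdb

con = duckdb.connect()
con.execute("create table df (postcode_arr varchar[])")
con.execute("insert into df values (['AB1', 'CD2']), (['AB1']), (NULL)")

# Frequencies over array *elements*, mirroring the unnest branch in the diff.
rows = con.execute("""
    select value, count(*) as value_count
    from (select unnest(postcode_arr) as value from df)
    group by value
    order by value_count desc
""").fetchall()
print(rows)  # [('AB1', 2), ('CD2', 1)] -- null arrays contribute no rows
```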
A further review comment on this change:

> Ah, this is `False` by default too, so users are going to get spammed by the warning. This won't be so much of an issue once this is implemented across the board, but it might be a little obnoxious if we don't apply it across the board for a while.