-
Notifications
You must be signed in to change notification settings - Fork 609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(sql): fuse distinct
with other select nodes when possible
#9923
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,7 @@ | |
predicates: VarTuple[ops.Value[dt.Boolean]] = () | ||
qualified: VarTuple[ops.Value[dt.Boolean]] = () | ||
sort_keys: VarTuple[ops.SortKey] = () | ||
distinct: bool = False | ||
|
||
def is_star_selection(self): | ||
return tuple(self.values.items()) == tuple(self.parent.fields.items()) | ||
|
@@ -128,6 +129,12 @@ | |
return Select(_.parent, selections=_.values, sort_keys=_.keys) | ||
|
||
|
||
@replace(p.Distinct) | ||
def distinct_to_select(_, **kwargs): | ||
"""Convert a Distinct node to a Select node.""" | ||
return Select(_.parent, selections=_.values, distinct=True) | ||
|
||
|
||
@replace(p.DropColumns) | ||
def drop_columns_to_select(_, **kwargs): | ||
"""Convert a DropColumns node to a Select node.""" | ||
|
@@ -244,6 +251,48 @@ | |
if _.parent.find_below(blocking, filter=ops.Value): | ||
return _ | ||
|
||
if _.parent.distinct: | ||
# The inner query is distinct. | ||
# | ||
# If the outer query is distinct, it's only safe to merge if it's a simple subselection: | ||
# - Fusing in the presence of non-deterministic calls in the select would lead to | ||
# incorrect results | ||
# - Fusing in the presence of expensive calls in the select would lead to potential | ||
# performance pitfalls | ||
if _.distinct and not all( | ||
isinstance(v, ops.Field) for v in _.selections.values() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this cover the alias-in-outer-project case? SELECT a, b AS c
FROM (
SELECT DISTINCT
a, b
FROM t
) would become SELECT DISTINCT
a, b AS c
FROM t If not, fine to either handle later or perhaps never if it doesn't come up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This branch is for both being distinct, so:
In this case, yes, the aliases are properly handled. The branch on line 270 handles the |
||
): | ||
return _ | ||
|
||
# If the outer query isn't distinct, it's only safe to merge if the outer is a SELECT *: | ||
# - If new columns are added, they might be non-distinct, changing the distinctness | ||
# - If previous columns are removed, that would also change the distinctness | ||
if not _.distinct and not _.is_star_selection(): | ||
return _ | ||
|
||
distinct = True | ||
elif _.distinct: | ||
# The outer query is distinct and the inner isn't. It's only safe to merge if either | ||
# - The inner query isn't ordered | ||
# - The outer query is a SELECT * | ||
# | ||
# Otherwise we run the risk that the outer query drops columns needed for the ordering of | ||
# the inner query - many backends don't allow select distinc queries to order by columns | ||
# that aren't present in their selection, like | ||
# | ||
# SELECT DISTINCT a, b FROM t ORDER BY c --- some backends will explode at this | ||
# | ||
# An alternate solution would be to drop the inner ORDER BY clause, since the backend will | ||
# ignore it anyway since it's a subquery. That feels potentially risky though, better | ||
# to generate the SQL as written. | ||
if _.parent.sort_keys and not _.is_star_selection(): | ||
jcrist marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return _ | ||
jcrist marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
distinct = True | ||
else: | ||
# Neither query is distinct, safe to merge | ||
distinct = False | ||
|
||
subs = {ops.Field(_.parent, k): v for k, v in _.parent.values.items()} | ||
selections = {k: v.replace(subs, filter=ops.Value) for k, v in _.selections.items()} | ||
|
||
|
@@ -266,6 +315,7 @@ | |
predicates=unique_predicates, | ||
qualified=unique_qualified, | ||
sort_keys=unique_sort_keys, | ||
distinct=distinct, | ||
) | ||
return result if complexity(result) <= complexity(_) else _ | ||
|
||
|
@@ -289,6 +339,7 @@ | |
node: ops.Node, | ||
params: Mapping[ops.ScalarParameter, Any], | ||
rewrites: Sequence[Pattern] = (), | ||
post_rewrites: Sequence[Pattern] = (), | ||
fuse_selects: bool = True, | ||
) -> tuple[ops.Node, list[ops.Node]]: | ||
"""Lower the ibis expression graph to a SQL-like relational algebra. | ||
|
@@ -300,7 +351,9 @@ | |
params | ||
A mapping of scalar parameters to their values. | ||
rewrites | ||
Supplementary rewrites to apply to the expression graph. | ||
Supplementary rewrites to apply before SQL-specific transforms. | ||
post_rewrites | ||
Supplementary rewrites to apply after SQL-specific transforms. | ||
fuse_selects | ||
Whether to merge subsequent Select nodes into one where possible. | ||
|
||
|
@@ -322,6 +375,7 @@ | |
| project_to_select | ||
| filter_to_select | ||
| sort_to_select | ||
| distinct_to_select | ||
| fill_null_to_select | ||
| drop_null_to_select | ||
| drop_columns_to_select | ||
|
@@ -335,6 +389,9 @@ | |
else: | ||
simplified = sqlized | ||
|
||
if post_rewrites: | ||
simplified = simplified.replace(reduce(operator.or_, post_rewrites)) | ||
|
||
# extract common table expressions while wrapping them in a CTE node | ||
ctes = extract_ctes(simplified) | ||
|
||
|
@@ -351,6 +408,46 @@ | |
# supplemental rewrites selectively used on a per-backend basis | ||
|
||
|
||
@replace(Select) | ||
def split_select_distinct_with_order_by(_): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are dragons here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missed opportunity to call this |
||
"""Split a `SELECT DISTINCT ... ORDER BY` query when needed. | ||
|
||
Some databases (postgres, pyspark, ...) have issues with two types of | ||
ordered select distinct statements: | ||
|
||
``` | ||
--- ORDER BY with an expression instead of a name in the select list | ||
SELECT DISTINCT a, b FROM t ORDER BY a + 1 | ||
|
||
--- ORDER BY using a qualified column name, rather than the alias in the select list | ||
SELECT DISTINCT a, b as x FROM t ORDER BY b --- or t.b | ||
``` | ||
|
||
We solve both these cases by splitting everything except the `ORDER BY` | ||
into a subquery. | ||
|
||
``` | ||
SELECT DISTINCT a, b FROM t WHERE a > 10 ORDER BY a + 1 | ||
--- is rewritten as -> | ||
SELECT * FROM (SELECT DISTINCT a, b FROM t WHERE a > 10) ORDER BY a + 1 | ||
``` | ||
""" | ||
# risingwave and pyspark also don't allow qualified names as sort keys, like | ||
# SELECT DISTINCT t.a FROM t ORDER BY t.a | ||
# To avoid having specific rewrite rules for these backends to use only | ||
# local names, we always split SELECT DISTINCT from ORDER BY here. Otherwise we | ||
# could also avoid splitting if all sort keys appear in the select list. | ||
if _.distinct and _.sort_keys: | ||
inner = _.copy(sort_keys=()) | ||
subs = {v: ops.Field(inner, k) for k, v in inner.values.items()} | ||
sort_keys = tuple(s.replace(subs, filter=ops.Value) for s in _.sort_keys) | ||
selections = { | ||
k: v.replace(subs, filter=ops.Value) for k, v in _.selections.items() | ||
} | ||
return Select(inner, selections=selections, sort_keys=sort_keys) | ||
return _ | ||
|
||
|
||
@replace(p.WindowFunction(func=p.NTile(y), order_by=())) | ||
def add_order_by_to_empty_ranking_window_functions(_, **kwargs): | ||
"""Add an ORDER BY clause to rank window functions that don't have one.""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,3 @@ | ||
SELECT DISTINCT | ||
* | ||
FROM ( | ||
SELECT | ||
"t0"."string_col" | ||
FROM "functional_alltypes" AS "t0" | ||
) AS "t1" | ||
"t0"."string_col" | ||
FROM "functional_alltypes" AS "t0" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,4 @@ | ||
SELECT DISTINCT | ||
* | ||
FROM ( | ||
SELECT | ||
"t0"."string_col", | ||
"t0"."int_col" | ||
FROM "functional_alltypes" AS "t0" | ||
) AS "t1" | ||
"t0"."string_col", | ||
"t0"."int_col" | ||
FROM "functional_alltypes" AS "t0" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
SELECT DISTINCT | ||
"t0"."a", | ||
"t0"."b", | ||
"t0"."c" | ||
FROM "test" AS "t0" | ||
WHERE | ||
"t0"."c" > 10 AND "t0"."a" > 10 | ||
ORDER BY | ||
"t0"."a" ASC |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
SELECT DISTINCT | ||
"t0"."a", | ||
"t0"."b", | ||
"t0"."c" | ||
FROM "test" AS "t0" | ||
WHERE | ||
"t0"."c" > 10 AND "t0"."a" > 10 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are also dragons here.