Skip to content

Commit

Permalink
Optimize DISTINCT, ORDER BY clause when Aggregation without Group By.
Browse files Browse the repository at this point in the history
For a query that has aggregates but no GROUP BY clause, the
DISTINCT/DISTINCT ON/ORDER BY clauses can be removed, because at most
one row is returned.
There is no need to perform a Unique or Sort step.
This simplifies the plan and lets the planner process fewer Aggref nodes.

Before this commit:

select distinct on(count(b), count(c)) count(a), sum(b) from
t_distinct_sort order by count(c);
                           QUERY PLAN
--------------------------------------------------------------------
 Unique
   Output: (count(a)), (sum(b)), (count(c)), (count(b))
   Group Key: (count(c)), (count(b))
   ->  Sort
         Output: (count(a)), (sum(b)), (count(c)), (count(b))
         Sort Key: (count(t_distinct_sort.c)),
(count(t_distinct_sort.b))
         ->  Finalize Aggregate
               Output: count(a), sum(b), count(c), count(b)
               ->  Gather Motion 3:1  (slice1; segments: 3)
                     Output: (PARTIAL count(a)), (PARTIAL sum(b)),
(PARTIAL count(c)), (PARTIAL count(b))
                     ->  Partial Aggregate
                           Output: PARTIAL count(a), PARTIAL sum(b),
PARTIAL count(c), PARTIAL count(b)
                           ->  Seq Scan on public.t_distinct_sort
                                 Output: a, b, c

After this commit:

select distinct on(count(b), count(c)) count(a), sum(b) from
t_distinct_sort order by count(c);
                      QUERY PLAN
--------------------------------------------------------
 Finalize Aggregate
   Output: count(a), sum(b)
   ->  Gather Motion 3:1  (slice1; segments: 3)
         Output: (PARTIAL count(a)), (PARTIAL sum(b))
         ->  Partial Aggregate
               Output: PARTIAL count(a), PARTIAL sum(b)
               ->  Seq Scan on public.t_distinct_sort
                     Output: a, b, c
 Optimizer: Postgres query optimizer

Authored-by: Zhang Mingli [email protected]
  • Loading branch information
avamingli committed Oct 24, 2024
1 parent 5633fbb commit 4a01111
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 0 deletions.
59 changes: 59 additions & 0 deletions src/backend/optimizer/plan/planner.c
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,65 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
*/
if (hasResultRTEs)
remove_useless_result_rtes(root);

/*
 * DISTINCT / ORDER BY optimization for plain aggregation.
 *
 * A query that has aggregates but no GROUP BY returns at most one row, so
 * DISTINCT, DISTINCT ON and ORDER BY cannot change its result, e.g.:
 *		select DISTINCT count(a) from t;	becomes
 *		select count(a) from t;
 *
 * The one-row guarantee does not hold, and the clauses are kept, when:
 *	- grouping sets are present: groupClause can be NIL while groupingSets
 *	  is not, e.g. GROUP BY GROUPING SETS ((), ()), which yields one row
 *	  per empty grouping set;
 *	- the target list contains set-returning functions, which can expand
 *	  the single aggregate row into several rows.
 *
 * The rewrite is also skipped when contain_mutable_functions() finds a
 * match anywhere in the query tree.
 */
if (parse->hasAggs &&
	parse->groupClause == NIL &&
	parse->groupingSets == NIL &&
	!parse->hasTargetSRFs &&
	!contain_mutable_functions((Node *) parse))
{
	List	   *useless_tlist = NIL;	/* junk TLEs that only fed DISTINCT/ORDER BY */
	List	   *tles;
	List	   *sortops;	/* filled by get_sortgroupclauses_tles(), unused here */
	List	   *eqops;		/* filled by get_sortgroupclauses_tles(), unused here */
	ListCell   *lc;

	if (parse->distinctClause != NIL)
	{
		/*
		 * Collect the junk target entries referenced by DISTINCT [ON];
		 * non-junk entries are part of the visible output and must stay.
		 */
		get_sortgroupclauses_tles(parse->distinctClause, parse->targetList,
								  &tles, &sortops, &eqops);
		foreach(lc, tles)
		{
			TargetEntry *tle = lfirst(lc);

			if (tle->resjunk)
				useless_tlist = lappend(useless_tlist, tle);
		}
		parse->distinctClause = NIL;
		parse->hasDistinctOn = false;
	}

	if (parse->sortClause != NIL)
	{
		get_sortgroupclauses_tles(parse->sortClause, parse->targetList,
								  &tles, &sortops, &eqops);
		foreach(lc, tles)
		{
			TargetEntry *tle = lfirst(lc);

			/*
			 * With DISTINCT ON, an ORDER BY expression can be the same
			 * junk entry that was already collected above, so avoid
			 * adding it twice.
			 */
			if (tle->resjunk)
				useless_tlist = list_append_unique(useless_tlist, tle);
		}
		parse->sortClause = NIL;
	}

	/*
	 * groupClause, sortClause and distinctClause are all empty now, so the
	 * collected junk entries are no longer referenced by any of them and
	 * can be dropped from the target list.
	 *
	 * NOTE(review): a window clause's PARTITION BY / ORDER BY refers to
	 * target entries through the same ressortgroupref numbering and may
	 * share a junk entry with the query-level ORDER BY.  Keep the junk
	 * entries when a window clause exists; they are harmless extra
	 * output columns that get filtered as junk.
	 */
	if (useless_tlist != NIL && parse->windowClause == NIL)
		parse->targetList = list_difference(parse->targetList, useless_tlist);
}

/*
* Do the main planning.
Expand Down
123 changes: 123 additions & 0 deletions src/test/regress/expected/select_distinct.out
Original file line number Diff line number Diff line change
Expand Up @@ -534,3 +534,126 @@ DROP TABLE capitals;
DROP TABLE cities;
set gp_statistics_pullup_from_child_partition to off;
-- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on
create table t_distinct_sort(a int, b int, c int);
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
analyze t_distinct_sort;
explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(10 rows)

select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(10 rows)

select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(10 rows)

select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort ;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(10 rows)

select distinct count(a), sum(b) from t_distinct_sort ;
count | sum
-------+-----
30 | 195
(1 row)

-- should keep distinct clause
explain(verbose, costs off)
select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
Unique
Output: (count(a)), (sum(b)), (count(random()))
Group Key: (count(random()))
-> Sort
Output: (count(a)), (sum(b)), (count(random()))
Sort Key: (count(random()))
-> Finalize Aggregate
Output: count(a), sum(b), count(random())
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random()))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random())
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(16 rows)

select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
count | sum
-------+-----
30 | 195
(1 row)

drop table t_distinct_sort;
131 changes: 131 additions & 0 deletions src/test/regress/expected/select_distinct_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -537,3 +537,134 @@ DROP TABLE capitals;
DROP TABLE cities;
set gp_statistics_pullup_from_child_partition to off;
-- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on
create table t_distinct_sort(a int, b int, c int);
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
analyze t_distinct_sort;
explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Sort
Output: (count(a)), (sum(b))
Sort Key: (sum(t_distinct_sort.b)), (count(t_distinct_sort.a))
-> Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
Optimizer: Pivotal Optimizer (GPORCA)
(13 rows)

select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
Optimizer: Postgres query optimizer
(10 rows)

select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Result
Output: (count(a)), (sum(b))
-> Sort
Output: (count(a)), (sum(b)), (sum(a)), (count(c))
Sort Key: (sum(t_distinct_sort.a)), (count(t_distinct_sort.c))
-> Finalize Aggregate
Output: count(a), sum(b), sum(a), count(c)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL sum(a)), (PARTIAL count(c))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL sum(a), PARTIAL count(c)
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
Optimizer: Pivotal Optimizer (GPORCA)
(15 rows)

select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
count | sum
-------+-----
30 | 195
(1 row)

explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort ;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Finalize Aggregate
Output: count(a), sum(b)
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b)
-> Seq Scan on public.t_distinct_sort
Output: a, b
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
Optimizer: Pivotal Optimizer (GPORCA)
(10 rows)

select distinct count(a), sum(b) from t_distinct_sort ;
count | sum
-------+-----
30 | 195
(1 row)

-- should keep distinct clause
explain(verbose, costs off)
select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------
Unique
Output: (count(a)), (sum(b)), (count(random()))
Group Key: (count(random()))
-> Sort
Output: (count(a)), (sum(b)), (count(random()))
Sort Key: (count(random()))
-> Finalize Aggregate
Output: count(a), sum(b), count(random())
-> Gather Motion 3:1 (slice1; segments: 3)
Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random()))
-> Partial Aggregate
Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random())
-> Seq Scan on public.t_distinct_sort
Output: a, b, c
Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
Optimizer: Postgres query optimizer
(16 rows)

select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
count | sum
-------+-----
30 | 195
(1 row)

drop table t_distinct_sort;
25 changes: 25 additions & 0 deletions src/test/regress/sql/select_distinct.sql
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,28 @@ DROP TABLE capitals;
DROP TABLE cities;
set gp_statistics_pullup_from_child_partition to off;
-- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on

create table t_distinct_sort(a int, b int, c int);
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
analyze t_distinct_sort;

explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
explain(verbose, costs off)
select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
explain(verbose, costs off)
select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
explain(verbose, costs off)
select distinct count(a), sum(b) from t_distinct_sort ;
select distinct count(a), sum(b) from t_distinct_sort ;

-- should keep distinct clause
explain(verbose, costs off)
select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
drop table t_distinct_sort;

0 comments on commit 4a01111

Please sign in to comment.