Optimize DISTINCT and ORDER BY clauses for aggregation without GROUP BY.

For a query that has aggregation but no GROUP BY clause, the DISTINCT / DISTINCT ON / ORDER BY clauses can be removed, because at most one row is returned and there is no need to de-duplicate or sort it. This simplifies the plan and lets the planner process fewer Aggref nodes. The clauses are kept when the query contains mutable functions (e.g. random()).

Before this commit:

select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
                             QUERY PLAN
--------------------------------------------------------------------
 Unique
   Output: (count(a)), (sum(b)), (count(c)), (count(b))
   Group Key: (count(c)), (count(b))
   ->  Sort
         Output: (count(a)), (sum(b)), (count(c)), (count(b))
         Sort Key: (count(t_distinct_sort.c)), (count(t_distinct_sort.b))
         ->  Finalize Aggregate
               Output: count(a), sum(b), count(c), count(b)
               ->  Gather Motion 3:1  (slice1; segments: 3)
                     Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(c)), (PARTIAL count(b))
                     ->  Partial Aggregate
                           Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(c), PARTIAL count(b)
                           ->  Seq Scan on public.t_distinct_sort
                                 Output: a, b, c

After this commit:

select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
                       QUERY PLAN
--------------------------------------------------------
 Finalize Aggregate
   Output: count(a), sum(b)
   ->  Gather Motion 3:1  (slice1; segments: 3)
         Output: (PARTIAL count(a)), (PARTIAL sum(b))
         ->  Partial Aggregate
               Output: PARTIAL count(a), PARTIAL sum(b)
               ->  Seq Scan on public.t_distinct_sort
                     Output: a, b, c
 Optimizer: Postgres query optimizer

Authored-by: Zhang Mingli [email protected]
apache · Oct 24, 2024 · 4a01111 · 4a01111
1 parent 5633fbb
commit 4a01111
Show file tree

Hide file tree

Showing 4 changed files with 338 additions and 0 deletions.
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
@@ -1338,6 +1338,65 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
 	 */
 	if (hasResultRTEs)
 		remove_useless_result_rtes(root);
+
+	/*
+	 * DISTINCT optimization.
+	 * Remove the DISTINCT clause if possible, e.g. rewrite
+	 * select DISTINCT count(a) from t; to
+	 * select count(a) from t;
+	 * At most one row is returned, so DISTINCT is pointless.
+	 * The same applies to the ORDER BY clause.
+	 */
+	if (parse->hasAggs &&
+		parse->groupClause == NIL &&
+		!contain_mutable_functions((Node *) parse))
+	{
+		List	   *useless_tlist = NIL;
+		List	   *tles;
+		List	   *sortops;
+		List	   *eqops;
+		ListCell   *lc;
+
+		if (parse->distinctClause != NIL)
+		{
+			get_sortgroupclauses_tles(parse->distinctClause, parse->targetList,
+									  &tles, &sortops, &eqops);
+			foreach(lc, tles)
+			{
+				TargetEntry *tle = lfirst(lc);
+				if (tle->resjunk)
+					useless_tlist = lappend(useless_tlist, tle);
+			}
+			parse->distinctClause = NIL;
+			if (parse->hasDistinctOn)
+				parse->hasDistinctOn = false;
+		}
+
+		if (parse->sortClause != NIL)
+		{
+
+			get_sortgroupclauses_tles(parse->sortClause, parse->targetList,
+									  &tles, &sortops, &eqops);
+			foreach(lc, tles)
+			{
+				TargetEntry *tle = lfirst(lc);
+				/*
+				 * For SELECT DISTINCT, ORDER BY expressions must appear in the select
+				 * list, so some tles may already be in useless_tlist.
+				 */
+				if (tle->resjunk)
+					useless_tlist = list_append_unique(useless_tlist, tle);
+			}
+			parse->sortClause = NIL;
+		}
+
+		/*
+		 * There is no groupClause, sortClause or distinctClause now.
+		 * The junk TargetEntries with a ressortgroupref index are safe to remove.
+		 */
+		if (useless_tlist != NIL)
+			parse->targetList = list_difference(parse->targetList, useless_tlist);
+	}
 
 	/*
 	 * Do the main planning.

diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
@@ -534,3 +534,126 @@ DROP TABLE capitals;
 DROP TABLE cities;
 set gp_statistics_pullup_from_child_partition to off;
 -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on
+create table t_distinct_sort(a int, b int, c int);
+insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+analyze t_distinct_sort;
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+                                                        QUERY PLAN                                                         
+---------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+                                                        QUERY PLAN                                                         
+---------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+                                                        QUERY PLAN                                                         
+---------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort ;
+                                                        QUERY PLAN                                                         
+---------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select distinct count(a), sum(b) from t_distinct_sort ;
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+-- should keep distinct clause
+explain(verbose, costs off) 
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+                                                        QUERY PLAN                                                         
+---------------------------------------------------------------------------------------------------------------------------
+ Unique
+   Output: (count(a)), (sum(b)), (count(random()))
+   Group Key: (count(random()))
+   ->  Sort
+         Output: (count(a)), (sum(b)), (count(random()))
+         Sort Key: (count(random()))
+         ->  Finalize Aggregate
+               Output: count(a), sum(b), count(random())
+               ->  Gather Motion 3:1  (slice1; segments: 3)
+                     Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random()))
+                     ->  Partial Aggregate
+                           Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random())
+                           ->  Seq Scan on public.t_distinct_sort
+                                 Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(16 rows)
+
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+drop table t_distinct_sort;
diff --git a/src/test/regress/expected/select_distinct_optimizer.out b/src/test/regress/expected/select_distinct_optimizer.out
@@ -537,3 +537,134 @@ DROP TABLE capitals;
 DROP TABLE cities;
 set gp_statistics_pullup_from_child_partition to off;
 -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on
+create table t_distinct_sort(a int, b int, c int);
+insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+analyze t_distinct_sort;
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Sort
+   Output: (count(a)), (sum(b))
+   Sort Key: (sum(t_distinct_sort.b)), (count(t_distinct_sort.a))
+   ->  Finalize Aggregate
+         Output: count(a), sum(b)
+         ->  Gather Motion 3:1  (slice1; segments: 3)
+               Output: (PARTIAL count(a)), (PARTIAL sum(b))
+               ->  Partial Aggregate
+                     Output: PARTIAL count(a), PARTIAL sum(b)
+                     ->  Seq Scan on public.t_distinct_sort
+                           Output: a, b
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
+ Optimizer: Pivotal Optimizer (GPORCA)
+(13 rows)
+
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Result
+   Output: (count(a)), (sum(b))
+   ->  Sort
+         Output: (count(a)), (sum(b)), (sum(a)), (count(c))
+         Sort Key: (sum(t_distinct_sort.a)), (count(t_distinct_sort.c))
+         ->  Finalize Aggregate
+               Output: count(a), sum(b), sum(a), count(c)
+               ->  Gather Motion 3:1  (slice1; segments: 3)
+                     Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL sum(a)), (PARTIAL count(c))
+                     ->  Partial Aggregate
+                           Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL sum(a), PARTIAL count(c)
+                           ->  Seq Scan on public.t_distinct_sort
+                                 Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
+ Optimizer: Pivotal Optimizer (GPORCA)
+(15 rows)
+
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort ;
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(a), sum(b)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(a)), (PARTIAL sum(b))
+         ->  Partial Aggregate
+               Output: PARTIAL count(a), PARTIAL sum(b)
+               ->  Seq Scan on public.t_distinct_sort
+                     Output: a, b
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
+ Optimizer: Pivotal Optimizer (GPORCA)
+(10 rows)
+
+select distinct count(a), sum(b) from t_distinct_sort ;
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+-- should keep distinct clause
+explain(verbose, costs off) 
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Unique
+   Output: (count(a)), (sum(b)), (count(random()))
+   Group Key: (count(random()))
+   ->  Sort
+         Output: (count(a)), (sum(b)), (count(random()))
+         Sort Key: (count(random()))
+         ->  Finalize Aggregate
+               Output: count(a), sum(b), count(random())
+               ->  Gather Motion 3:1  (slice1; segments: 3)
+                     Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random()))
+                     ->  Partial Aggregate
+                           Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random())
+                           ->  Seq Scan on public.t_distinct_sort
+                                 Output: a, b, c
+ Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on'
+ Optimizer: Postgres query optimizer
+(16 rows)
+
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+ count | sum 
+-------+-----
+    30 | 195
+(1 row)
+
+drop table t_distinct_sort;
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
@@ -200,3 +200,28 @@ DROP TABLE capitals;
 DROP TABLE cities;
 set gp_statistics_pullup_from_child_partition to off;
 -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on
+
+create table t_distinct_sort(a int, b int, c int);
+insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+insert into t_distinct_sort select i, i+1, i+2  from generate_series(1, 10)i;
+analyze t_distinct_sort;
+
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a);
+explain(verbose, costs off)
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c);
+explain(verbose, costs off)
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+select count(a), sum(b) from t_distinct_sort order by sum(a), count(c);
+explain(verbose, costs off)
+select distinct count(a), sum(b) from t_distinct_sort ;
+select distinct count(a), sum(b) from t_distinct_sort ;
+
+-- should keep distinct clause
+explain(verbose, costs off) 
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+select distinct on(count(random())) count(a), sum(b) from t_distinct_sort;
+drop table t_distinct_sort;