From 5fe368d65344de4bfd96dcf8dd7472aeb2525e8b Mon Sep 17 00:00:00 2001
From: Alexander Kondakov
Date: Mon, 24 Jul 2023 18:04:20 +0300
Subject: [PATCH] ADBDEV-5527 Change the portal strategy for the modifying CTE case

In vanilla PG and Greenplum, queries containing a modifying CTE are
executed with the portal strategy PORTAL_ONE_MOD_WITH, which stores the
result of the query in the portal's tuple store, from which the final
result is then fetched. This strategy is reasonable and helps us avoid
executing the modifying operation several times, for example, when one
needs to fetch the query result partially. However, in Greenplum,
executing this strategy at QE nodes is redundant, because clients
communicate with the system through the QD node. Therefore, this commit
proposes a different strategy for execution at QE nodes.

Moreover, when a query contains a modifying CTE, EXPLAIN ANALYZE reports
most of the plan nodes as "never executed", even though the corresponding
nodes have been fully executed and have returned tuples. This problem
arises because of the PORTAL_ONE_MOD_WITH execution strategy, under which
an extra pq message with the tuple description is sent to the QD, so the
statistics can't be retrieved properly in the cdbexplain_recvExecStats
function.

(cherry picked from commit c9b7b025d303e5fcba406a806d8264d34c429b23)
---
 src/backend/tcop/pquery.c                     |  10 +-
 src/test/regress/expected/explain_analyze.out | 101 ++++++++++++++---
 .../expected/explain_analyze_optimizer.out    | 105 +++++++++++++++---
 src/test/regress/sql/explain_analyze.sql      |  26 +++++
 4 files changed, 209 insertions(+), 33 deletions(-)

diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index dbeff8c054e2..e44cf0b959bf 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -357,6 +357,9 @@ ChoosePortalStrategy(List *stmts)
 	/* Note For CreateTableAs, we still use PORTAL_MULTI_QUERY (not like PG)
 	 * since QE needs to use DestRemote to deliver completionTag to QD and
 	 * use DestIntoRel to insert tuples into the table(s).
+	 *
+	 * For a modifying CTE we use PORTAL_MULTI_QUERY at QE nodes, because
+	 * we don't need to save the result in the portal's tuple store.
 	 */
 	if (list_length(stmts) == 1)
 	{
@@ -396,10 +399,11 @@
 			pstmt->copyIntoClause == NULL &&
 			pstmt->refreshClause == NULL)
 		{
-			if (pstmt->hasModifyingCTE)
-				return PORTAL_ONE_MOD_WITH;
-			else
+			if (!pstmt->hasModifyingCTE)
 				return PORTAL_ONE_SELECT;
+			if (Gp_role != GP_ROLE_EXECUTE)
+				return PORTAL_ONE_MOD_WITH;
+			return PORTAL_MULTI_QUERY;
 		}
 		if (pstmt->commandType == CMD_UTILITY)
 		{
diff --git a/src/test/regress/expected/explain_analyze.out b/src/test/regress/expected/explain_analyze.out
index fd70e3162c52..50ca8626522f 100644
--- a/src/test/regress/expected/explain_analyze.out
+++ b/src/test/regress/expected/explain_analyze.out
@@ -13,6 +13,8 @@
 -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/
 -- m/Executor memory: \d+\w? bytes/
 -- s/Executor memory: \d+\w? bytes/Executor memory: ### bytes/
+-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./
+-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./
 -- m/Memory used:\s+\d+\w?B/
 -- s/Memory used:\s+\d+\w?B/Memory used: ###B/
 -- m/\d+\w? bytes max \(seg\d+\)/
@@ -32,7 +34,7 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table;
 ------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3) (actual rows=0 loops=1)
    ->  Seq Scan on empty_table (actual rows=0 loops=1)
- Optimizer: Postgres query optimizer
+ Optimizer: Postgres-based planner
 (3 rows)
 
 -- explain_processing_on
@@ -48,13 +50,14 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT t1.a FROM empty_tab
    ->  Seq Scan on empty_table t1 (actual rows=0 loops=1)
    ->  Hash (never executed)
          ->  Seq Scan on empty_table t2 (never executed)
- Optimizer: Postgres query optimizer
+ Optimizer: Postgres-based planner
 (7 rows)
 
 -- explain_processing_on
 -- If all QEs hit errors when executing sort, we might not receive stat data for sort.
 -- rethrow error before print explain info.
 create extension if not exists gp_inject_fault;
+NOTICE:  extension "gp_inject_fault" already exists, skipping
 create table sort_error_test1(tc1 int, tc2 int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'tc1' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
@@ -72,7 +75,7 @@ select gp_inject_fault('explain_analyze_sort_error', 'error', dbid)
 (3 rows)
 
 EXPLAIN analyze insert into sort_error_test2 select * from sort_error_test1 order by 1;
-ERROR:  fault triggered, fault name:'explain_analyze_sort_error' fault type:'error'  (seg1 127.0.1.1:7003 pid=103595)
+ERROR:  fault triggered, fault name:'explain_analyze_sort_error' fault type:'error'  (seg1 172.17.0.2:7003 pid=36295)
 select count(*) from sort_error_test2;
  count 
 -------
@@ -116,12 +119,12 @@ explain (analyze, timing off, costs off) select a.i from slice_test a
    ->  Broadcast Motion 3:3  (slice2; segments: 3) (actual rows=1 loops=1)
          ->  Seq Scan on slice_test2 b (actual rows=1 loops=1)
  Optimizer: Postgres-based planner
- Planning Time: 0.662 ms
-   (slice0)    Executor memory: 47K bytes.
-   (slice1)    Executor memory: 43K bytes avg x 3 workers, 43K bytes max (seg0).  Work_mem: 17K bytes max.
+ Planning Time: 0.434 ms
+   (slice0)    Executor memory: 45K bytes.
+   (slice1)    Executor memory: 42K bytes avg x 3 workers, 42K bytes max (seg0).  Work_mem: 17K bytes max.
    (slice2)    Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0).
- Memory used: 128000kB
- Execution Time: 4.953 ms
+ Memory used: 256000kB
+ Execution Time: 9.264 ms
 (17 rows)
 
 -- checking this didn't break previous behavior
@@ -140,13 +143,83 @@ explain (analyze, timing off, costs off)
          ->  Result (actual rows=1 loops=2)
                One-Time Filter: ((x.x)::integer = 0)
  Optimizer: Postgres-based planner
- Planning Time: 0.245 ms
-   (slice0)    Executor memory: 19K bytes.
-   (slice1)    Executor memory: 36K bytes.  Work_mem: 17K bytes max.
- Memory used: 128000kB
- Execution Time: 0.082 ms
-(14 rows)
+ Planning Time: 0.233 ms
+   (slice0)    Executor memory: 18K bytes.
+   (slice1)    Executor memory: 35K bytes.  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 0.090 ms
+(13 rows)
 
 -- explain_processing_on
 drop table slice_test;
 drop table slice_test2;
+-- The statistics for modifying CTEs used to be reported as "never executed",
+-- even though all plan nodes were executed and stat information was expected.
+-- Test that the QD receives the stats from all slices and shows them in the explain output.
+--start_ignore
+DROP TABLE IF EXISTS with_dml;
+NOTICE:  table "with_dml" does not exist, skipping
+--end_ignore
+CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i);
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Insert on with_dml (actual rows=3 loops=1)
+               ->  Redistribute Motion 1:3  (slice2; segments: 1) (actual rows=3 loops=1)
+                     Hash Key: i.i
+                     ->  Function Scan on generate_series i (actual rows=5 loops=1)
+ Optimizer: Postgres-based planner
+ Planning Time: 0.163 ms
+   (slice0)    Executor memory: 36K bytes.
+   (slice1)    Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0).  Work_mem: 17K bytes max.
+   (slice2)    Executor memory: 40K bytes (seg2).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 7.189 ms
+(13 rows)
+
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    UPDATE with_dml SET j = j + 1
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Update on with_dml (actual rows=3 loops=1)
+               ->  Seq Scan on with_dml (actual rows=3 loops=1)
+ Optimizer: Postgres-based planner
+ Planning Time: 0.288 ms
+   (slice0)    Executor memory: 38K bytes.
+   (slice1)    Executor memory: 38K bytes avg x 3 workers, 38K bytes max (seg0).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 1.290 ms
+(10 rows)
+
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    DELETE FROM with_dml WHERE i > 0
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Delete on with_dml (actual rows=3 loops=1)
+               ->  Seq Scan on with_dml (actual rows=3 loops=1)
+                     Filter: (i > 0)
+ Optimizer: Postgres-based planner
+ Planning Time: 0.232 ms
+   (slice0)    Executor memory: 38K bytes.
+   (slice1)    Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 0.947 ms
+(11 rows)
+
+DROP TABLE with_dml;
diff --git a/src/test/regress/expected/explain_analyze_optimizer.out b/src/test/regress/expected/explain_analyze_optimizer.out
index c809518660ad..723861deeb4a 100644
--- a/src/test/regress/expected/explain_analyze_optimizer.out
+++ b/src/test/regress/expected/explain_analyze_optimizer.out
@@ -13,6 +13,8 @@
 -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/
 -- m/Executor memory: \d+\w? bytes/
 -- s/Executor memory: \d+\w? bytes/Executor memory: ### bytes/
+-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./
+-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./
 -- m/Memory used:\s+\d+\w?B/
 -- s/Memory used:\s+\d+\w?B/Memory used: ###B/
 -- m/\d+\w? bytes max \(seg\d+\)/
@@ -32,7 +34,7 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table;
 ------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3) (actual rows=0 loops=1)
    ->  Seq Scan on empty_table (actual rows=0 loops=1)
- Optimizer: Pivotal Optimizer (GPORCA)
+ Optimizer: GPORCA
 (3 rows)
 
 -- explain_processing_on
@@ -40,22 +42,23 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table;
 -- be reported
 -- explain_processing_off
 EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT t1.a FROM empty_table t1 join empty_table t2 on t1.a = t2.a;
-                                    QUERY PLAN
---------------------------------------------------------------------------------
+                              QUERY PLAN
+----------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3) (actual rows=0 loops=1)
    ->  Hash Join (actual rows=0 loops=1)
          Hash Cond: (t1.a = t2.a)
          ->  Seq Scan on empty_table t1 (never executed)
          ->  Hash (actual rows=0 loops=1)
-               Buckets: 524288  Batches: 1  Memory Usage: 4096kB
+               Buckets: 1048576  Batches: 1  Memory Usage: 8192kB
          ->  Seq Scan on empty_table t2 (actual rows=0 loops=1)
- Optimizer: Pivotal Optimizer (GPORCA)
+ Optimizer: GPORCA
 (8 rows)
 
 -- explain_processing_on
 -- If all QEs hit errors when executing sort, we might not receive stat data for sort.
 -- rethrow error before print explain info.
 create extension if not exists gp_inject_fault;
+NOTICE:  extension "gp_inject_fault" already exists, skipping
 create table sort_error_test1(tc1 int, tc2 int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'tc1' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
@@ -73,7 +76,7 @@ select gp_inject_fault('explain_analyze_sort_error', 'error', dbid)
 (3 rows)
 
 EXPLAIN analyze insert into sort_error_test2 select * from sort_error_test1 order by 1;
-ERROR:  fault triggered, fault name:'explain_analyze_sort_error' fault type:'error'  (seg0 127.0.1.1:7002 pid=103209)
+ERROR:  fault triggered, fault name:'explain_analyze_sort_error' fault type:'error'  (seg0 172.17.0.2:7002 pid=37168)
 select count(*) from sort_error_test2;
  count 
 -------
@@ -119,12 +122,12 @@ explain (analyze, timing off, costs off) select a.i from slice_test a
    ->  Gather Motion 3:1  (slice2; segments: 3) (actual rows=1 loops=1)
          ->  Seq Scan on slice_test2 b (actual rows=1 loops=1)
  Optimizer: GPORCA
- Planning Time: 6.096 ms
-   (slice0)    Executor memory: 49K bytes.  Work_mem: 17K bytes max.
+ Planning Time: 4.463 ms
+   (slice0)    Executor memory: 48K bytes.  Work_mem: 17K bytes max.
    (slice1)    Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0).
    (slice2)    Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0).
- Memory used: 128000kB
- Execution Time: 5.096 ms
+ Memory used: 256000kB
+ Execution Time: 6.883 ms
 (19 rows)
 
 -- checking this didn't break previous behavior
@@ -143,13 +146,83 @@ explain (analyze, timing off, costs off)
          ->  Result (actual rows=1 loops=2)
                One-Time Filter: ((x.x)::integer = 0)
  Optimizer: Postgres-based planner
- Planning Time: 2.920 ms
-   (slice0)    Executor memory: 19K bytes.
-   (slice1)    Executor memory: 36K bytes.  Work_mem: 17K bytes max.
- Memory used: 128000kB
- Execution Time: 0.085 ms
-(14 rows)
+ Planning Time: 3.210 ms
+   (slice0)    Executor memory: 18K bytes.
+   (slice1)    Executor memory: 35K bytes.  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 0.074 ms
+(13 rows)
 
 -- explain_processing_on
 drop table slice_test;
 drop table slice_test2;
+-- The statistics for modifying CTEs used to be reported as "never executed",
+-- even though all plan nodes were executed and stat information was expected.
+-- Test that the QD receives the stats from all slices and shows them in the explain output.
+--start_ignore
+DROP TABLE IF EXISTS with_dml;
+NOTICE:  table "with_dml" does not exist, skipping
+--end_ignore
+CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i);
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Insert on with_dml (actual rows=3 loops=1)
+               ->  Redistribute Motion 1:3  (slice2; segments: 1) (actual rows=3 loops=1)
+                     Hash Key: i.i
+                     ->  Function Scan on generate_series i (actual rows=5 loops=1)
+ Optimizer: Postgres-based planner
+ Planning Time: 2.969 ms
+   (slice0)    Executor memory: 36K bytes.
+   (slice1)    Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0).  Work_mem: 17K bytes max.
+   (slice2)    Executor memory: 40K bytes (seg2).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 7.598 ms
+(13 rows)
+
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    UPDATE with_dml SET j = j + 1
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Update on with_dml (actual rows=3 loops=1)
+               ->  Seq Scan on with_dml (actual rows=3 loops=1)
+ Optimizer: Postgres-based planner
+ Planning Time: 2.896 ms
+   (slice0)    Executor memory: 38K bytes.
+   (slice1)    Executor memory: 38K bytes avg x 3 workers, 38K bytes max (seg0).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 1.348 ms
+(10 rows)
+
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    DELETE FROM with_dml WHERE i > 0
+    RETURNING i
+) SELECT * FROM cte;
+                                                 QUERY PLAN
+-----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3) (actual rows=5 loops=1)
+   ->  Shared Scan (share slice:id 1:0) (actual rows=3 loops=1)
+         ->  Delete on with_dml (actual rows=3 loops=1)
+               ->  Seq Scan on with_dml (actual rows=3 loops=1)
+                     Filter: (i > 0)
+ Optimizer: Postgres-based planner
+ Planning Time: 2.691 ms
+   (slice0)    Executor memory: 38K bytes.
+   (slice1)    Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0).  Work_mem: 17K bytes max.
+ Memory used: 256000kB
+ Execution Time: 1.059 ms
+(11 rows)
+
+DROP TABLE with_dml;
diff --git a/src/test/regress/sql/explain_analyze.sql b/src/test/regress/sql/explain_analyze.sql
index 23557b12e031..db93eb815e52 100644
--- a/src/test/regress/sql/explain_analyze.sql
+++ b/src/test/regress/sql/explain_analyze.sql
@@ -13,6 +13,8 @@
 -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/
 -- m/Executor memory: \d+\w? bytes/
 -- s/Executor memory: \d+\w? bytes/Executor memory: ### bytes/
+-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./
+-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./
 -- m/Memory used:\s+\d+\w?B/
 -- s/Memory used:\s+\d+\w?B/Memory used: ###B/
 -- m/\d+\w? bytes max \(seg\d+\)/
@@ -76,3 +78,27 @@ explain (analyze, timing off, costs off)
 
 drop table slice_test;
 drop table slice_test2;
+
+-- The statistics for modifying CTEs used to be reported as "never executed",
+-- even though all plan nodes were executed and stat information was expected.
+-- Test that the QD receives the stats from all slices and shows them in the explain output.
+--start_ignore
+DROP TABLE IF EXISTS with_dml;
+--end_ignore
+CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i);
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i
+    RETURNING i
+) SELECT * FROM cte;
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    UPDATE with_dml SET j = j + 1
+    RETURNING i
+) SELECT * FROM cte;
+EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF)
+WITH cte AS (
+    DELETE FROM with_dml WHERE i > 0
+    RETURNING i
+) SELECT * FROM cte;
+DROP TABLE with_dml;
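
-- 
For reference, the strategy choice this patch arrives at can be modeled in
isolation. The following is a minimal standalone sketch, not server code:
PortalStrategy and GpRoleValue here are simplified stand-ins for the real
enums (utils/portal.h and cdb/cdbvars.h), and choose_strategy only mirrors
the branch order of the patched ChoosePortalStrategy().

/*
 * Sketch: how the patched branch picks a portal strategy for a query,
 * depending on whether it has a modifying CTE and on the node's role.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum
{
	PORTAL_ONE_SELECT,
	PORTAL_ONE_MOD_WITH,
	PORTAL_MULTI_QUERY
} PortalStrategy;

typedef enum
{
	GP_ROLE_DISPATCH,			/* QD: clients fetch results from here */
	GP_ROLE_EXECUTE				/* QE: results are forwarded to the QD */
} GpRoleValue;

static PortalStrategy
choose_strategy(bool hasModifyingCTE, GpRoleValue role)
{
	if (!hasModifyingCTE)
		return PORTAL_ONE_SELECT;
	/* The QD keeps buffering the result in the portal's tuple store... */
	if (role != GP_ROLE_EXECUTE)
		return PORTAL_ONE_MOD_WITH;
	/* ...while a QE streams tuples to the QD without buffering them. */
	return PORTAL_MULTI_QUERY;
}

int
main(void)
{
	/* Expect PORTAL_ONE_MOD_WITH (1) on the QD, PORTAL_MULTI_QUERY (2) on a QE. */
	printf("QD: %d\n", choose_strategy(true, GP_ROLE_DISPATCH));
	printf("QE: %d\n", choose_strategy(true, GP_ROLE_EXECUTE));
	return 0;
}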