diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index dbeff8c054e2..e44cf0b959bf 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -357,6 +357,9 @@ ChoosePortalStrategy(List *stmts) /* Note For CreateTableAs, we still use PORTAL_MULTI_QUERY (not like PG) * since QE needs to use DestRemote to deliver completionTag to QD and * use DestIntoRel to insert tuples into the table(s). + * + * For a modifying CTE, we use PORTAL_MULTI_QUERY at QE nodes, because + * we don't need to save the result inside the portal's tuple store. */ if (list_length(stmts) == 1) { @@ -396,10 +399,11 @@ ChoosePortalStrategy(List *stmts) pstmt->copyIntoClause == NULL && pstmt->refreshClause == NULL) { - if (pstmt->hasModifyingCTE) - return PORTAL_ONE_MOD_WITH; - else + if (!pstmt->hasModifyingCTE) return PORTAL_ONE_SELECT; + if (Gp_role != GP_ROLE_EXECUTE) + return PORTAL_ONE_MOD_WITH; + return PORTAL_MULTI_QUERY; } if (pstmt->commandType == CMD_UTILITY) { diff --git a/src/test/regress/expected/explain_analyze.out b/src/test/regress/expected/explain_analyze.out index fd70e3162c52..50ca8626522f 100644 --- a/src/test/regress/expected/explain_analyze.out +++ b/src/test/regress/expected/explain_analyze.out @@ -13,6 +13,8 @@ -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/ -- m/Executor memory: \d+\w? bytes/ -- s/Executor memory: \d+\w? bytes/Executor memory: ### bytes/ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Memory used:\s+\d+\w?B/ -- s/Memory used:\s+\d+\w?B/Memory used: ###B/ -- m/\d+\w?
bytes max \(seg\d+\)/ @@ -32,7 +34,7 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table; ------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) (actual rows=0 loops=1) -> Seq Scan on empty_table (actual rows=0 loops=1) - Optimizer: Postgres query optimizer + Optimizer: Postgres-based planner (3 rows) -- explain_processing_on @@ -48,13 +50,14 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT t1.a FROM empty_tab -> Seq Scan on empty_table t1 (actual rows=0 loops=1) -> Hash (never executed) -> Seq Scan on empty_table t2 (never executed) - Optimizer: Postgres query optimizer + Optimizer: Postgres-based planner (7 rows) -- explain_processing_on -- If all QEs hit errors when executing sort, we might not receive stat data for sort. -- rethrow error before print explain info. create extension if not exists gp_inject_fault; +NOTICE: extension "gp_inject_fault" already exists, skipping create table sort_error_test1(tc1 int, tc2 int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'tc1' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
@@ -72,7 +75,7 @@ select gp_inject_fault('explain_analyze_sort_error', 'error', dbid) (3 rows) EXPLAIN analyze insert into sort_error_test2 select * from sort_error_test1 order by 1; -ERROR: fault triggered, fault name:'explain_analyze_sort_error' fault type:'error' (seg1 127.0.1.1:7003 pid=103595) +ERROR: fault triggered, fault name:'explain_analyze_sort_error' fault type:'error' (seg1 172.17.0.2:7003 pid=36295) select count(*) from sort_error_test2; count ------- @@ -116,12 +119,12 @@ explain (analyze, timing off, costs off) select a.i from slice_test a -> Broadcast Motion 3:3 (slice2; segments: 3) (actual rows=1 loops=1) -> Seq Scan on slice_test2 b (actual rows=1 loops=1) Optimizer: Postgres-based planner - Planning Time: 0.662 ms - (slice0) Executor memory: 47K bytes. - (slice1) Executor memory: 43K bytes avg x 3 workers, 43K bytes max (seg0). Work_mem: 17K bytes max. + Planning Time: 0.434 ms + (slice0) Executor memory: 45K bytes. + (slice1) Executor memory: 42K bytes avg x 3 workers, 42K bytes max (seg0). Work_mem: 17K bytes max. (slice2) Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0). - Memory used: 128000kB - Execution Time: 4.953 ms + Memory used: 256000kB + Execution Time: 9.264 ms (17 rows) -- checking this didn't break previous behavior @@ -140,13 +143,83 @@ explain (analyze, timing off, costs off) -> Result (actual rows=1 loops=2) One-Time Filter: ((x.x)::integer = 0) Optimizer: Postgres-based planner - Planning Time: 0.245 ms - (slice0) Executor memory: 19K bytes. - (slice1) Executor memory: 36K bytes. Work_mem: 17K bytes max. - Memory used: 128000kB - Execution Time: 0.082 ms -(14 rows) + Planning Time: 0.233 ms + (slice0) Executor memory: 18K bytes. + (slice1) Executor memory: 35K bytes. Work_mem: 17K bytes max. 
+ Memory used: 256000kB + Execution Time: 0.090 ms +(13 rows) -- explain_processing_on drop table slice_test; drop table slice_test2; +-- The statistics for modifying CTEs used to be reported as "never executed", +-- when all plan nodes were executed and some stat information was expected. +-- Test QD recieving the stats from all slices and showing it in explain output. +--start_ignore +DROP TABLE IF EXISTS with_dml; +NOTICE: table "with_dml" does not exist, skipping +--end_ignore +CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i); +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Insert on with_dml (actual rows=3 loops=1) + -> Redistribute Motion 1:3 (slice2; segments: 1) (actual rows=3 loops=1) + Hash Key: i.i + -> Function Scan on generate_series i (actual rows=5 loops=1) + Optimizer: Postgres-based planner + Planning Time: 0.163 ms + (slice0) Executor memory: 36K bytes. + (slice1) Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0). Work_mem: 17K bytes max. + (slice2) Executor memory: 40K bytes (seg2). Work_mem: 17K bytes max. 
+ Memory used: 256000kB + Execution Time: 7.189 ms +(13 rows) + +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + UPDATE with_dml SET j = j + 1 + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Update on with_dml (actual rows=3 loops=1) + -> Seq Scan on with_dml (actual rows=3 loops=1) + Optimizer: Postgres-based planner + Planning Time: 0.288 ms + (slice0) Executor memory: 38K bytes. + (slice1) Executor memory: 38K bytes avg x 3 workers, 38K bytes max (seg0). Work_mem: 17K bytes max. + Memory used: 256000kB + Execution Time: 1.290 ms +(10 rows) + +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + DELETE FROM with_dml WHERE i > 0 + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Delete on with_dml (actual rows=3 loops=1) + -> Seq Scan on with_dml (actual rows=3 loops=1) + Filter: (i > 0) + Optimizer: Postgres-based planner + Planning Time: 0.232 ms + (slice0) Executor memory: 38K bytes. + (slice1) Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0). Work_mem: 17K bytes max. + Memory used: 256000kB + Execution Time: 0.947 ms +(11 rows) + +DROP TABLE with_dml; diff --git a/src/test/regress/expected/explain_analyze_optimizer.out b/src/test/regress/expected/explain_analyze_optimizer.out index c809518660ad..723861deeb4a 100644 --- a/src/test/regress/expected/explain_analyze_optimizer.out +++ b/src/test/regress/expected/explain_analyze_optimizer.out @@ -13,6 +13,8 @@ -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/ -- m/Executor memory: \d+\w? 
bytes/ -- s/Executor memory: \d+\w? bytes/Executor memory: ### bytes/ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Memory used:\s+\d+\w?B/ -- s/Memory used:\s+\d+\w?B/Memory used: ###B/ -- m/\d+\w? bytes max \(seg\d+\)/ @@ -32,7 +34,7 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table; ------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) (actual rows=0 loops=1) -> Seq Scan on empty_table (actual rows=0 loops=1) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (3 rows) -- explain_processing_on @@ -40,22 +42,23 @@ EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT a FROM empty_table; -- be reported -- explain_processing_off EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF) SELECT t1.a FROM empty_table t1 join empty_table t2 on t1.a = t2.a; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (actual rows=0 loops=1) -> Hash Join (actual rows=0 loops=1) Hash Cond: (t1.a = t2.a) -> Seq Scan on empty_table t1 (never executed) -> Hash (actual rows=0 loops=1) - Buckets: 524288 Batches: 1 Memory Usage: 4096kB + Buckets: 1048576 Batches: 1 Memory Usage: 8192kB -> Seq Scan on empty_table t2 (actual rows=0 loops=1) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (8 rows) -- explain_processing_on -- If all QEs hit errors when executing sort, we might not receive stat data for sort. -- rethrow error before print explain info. 
create extension if not exists gp_inject_fault; +NOTICE: extension "gp_inject_fault" already exists, skipping create table sort_error_test1(tc1 int, tc2 int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'tc1' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -73,7 +76,7 @@ select gp_inject_fault('explain_analyze_sort_error', 'error', dbid) (3 rows) EXPLAIN analyze insert into sort_error_test2 select * from sort_error_test1 order by 1; -ERROR: fault triggered, fault name:'explain_analyze_sort_error' fault type:'error' (seg0 127.0.1.1:7002 pid=103209) +ERROR: fault triggered, fault name:'explain_analyze_sort_error' fault type:'error' (seg0 172.17.0.2:7002 pid=37168) select count(*) from sort_error_test2; count ------- @@ -119,12 +122,12 @@ explain (analyze, timing off, costs off) select a.i from slice_test a -> Gather Motion 3:1 (slice2; segments: 3) (actual rows=1 loops=1) -> Seq Scan on slice_test2 b (actual rows=1 loops=1) Optimizer: GPORCA - Planning Time: 6.096 ms - (slice0) Executor memory: 49K bytes. Work_mem: 17K bytes max. + Planning Time: 4.463 ms + (slice0) Executor memory: 48K bytes. Work_mem: 17K bytes max. (slice1) Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0). (slice2) Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0). - Memory used: 128000kB - Execution Time: 5.096 ms + Memory used: 256000kB + Execution Time: 6.883 ms (19 rows) -- checking this didn't break previous behavior @@ -143,13 +146,83 @@ explain (analyze, timing off, costs off) -> Result (actual rows=1 loops=2) One-Time Filter: ((x.x)::integer = 0) Optimizer: Postgres-based planner - Planning Time: 2.920 ms - (slice0) Executor memory: 19K bytes. - (slice1) Executor memory: 36K bytes. Work_mem: 17K bytes max. 
- Memory used: 128000kB - Execution Time: 0.085 ms -(14 rows) + Planning Time: 3.210 ms + (slice0) Executor memory: 18K bytes. + (slice1) Executor memory: 35K bytes. Work_mem: 17K bytes max. + Memory used: 256000kB + Execution Time: 0.074 ms +(13 rows) -- explain_processing_on drop table slice_test; drop table slice_test2; +-- The statistics for modifying CTEs used to be reported as "never executed", +-- when all plan nodes were executed and some stat information was expected. +-- Test QD recieving the stats from all slices and showing it in explain output. +--start_ignore +DROP TABLE IF EXISTS with_dml; +NOTICE: table "with_dml" does not exist, skipping +--end_ignore +CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i); +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Insert on with_dml (actual rows=3 loops=1) + -> Redistribute Motion 1:3 (slice2; segments: 1) (actual rows=3 loops=1) + Hash Key: i.i + -> Function Scan on generate_series i (actual rows=5 loops=1) + Optimizer: Postgres-based planner + Planning Time: 2.969 ms + (slice0) Executor memory: 36K bytes. + (slice1) Executor memory: 36K bytes avg x 3 workers, 36K bytes max (seg0). Work_mem: 17K bytes max. + (slice2) Executor memory: 40K bytes (seg2). Work_mem: 17K bytes max. 
+ Memory used: 256000kB + Execution Time: 7.598 ms +(13 rows) + +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + UPDATE with_dml SET j = j + 1 + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Update on with_dml (actual rows=3 loops=1) + -> Seq Scan on with_dml (actual rows=3 loops=1) + Optimizer: Postgres-based planner + Planning Time: 2.896 ms + (slice0) Executor memory: 38K bytes. + (slice1) Executor memory: 38K bytes avg x 3 workers, 38K bytes max (seg0). Work_mem: 17K bytes max. + Memory used: 256000kB + Execution Time: 1.348 ms +(10 rows) + +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + DELETE FROM with_dml WHERE i > 0 + RETURNING i +) SELECT * FROM cte; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (actual rows=5 loops=1) + -> Shared Scan (share slice:id 1:0) (actual rows=3 loops=1) + -> Delete on with_dml (actual rows=3 loops=1) + -> Seq Scan on with_dml (actual rows=3 loops=1) + Filter: (i > 0) + Optimizer: Postgres-based planner + Planning Time: 2.691 ms + (slice0) Executor memory: 38K bytes. + (slice1) Executor memory: 37K bytes avg x 3 workers, 37K bytes max (seg0). Work_mem: 17K bytes max. + Memory used: 256000kB + Execution Time: 1.059 ms +(11 rows) + +DROP TABLE with_dml; diff --git a/src/test/regress/sql/explain_analyze.sql b/src/test/regress/sql/explain_analyze.sql index 23557b12e031..db93eb815e52 100644 --- a/src/test/regress/sql/explain_analyze.sql +++ b/src/test/regress/sql/explain_analyze.sql @@ -13,6 +13,8 @@ -- s/Execution Time: [0-9.]+ ms/Execution Time: #.### ms/ -- m/Executor memory: \d+\w? bytes/ -- s/Executor memory: \d+\w? 
bytes/Executor memory: ### bytes/ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Memory used:\s+\d+\w?B/ -- s/Memory used:\s+\d+\w?B/Memory used: ###B/ -- m/\d+\w? bytes max \(seg\d+\)/ @@ -76,3 +78,27 @@ explain (analyze, timing off, costs off) drop table slice_test; drop table slice_test2; + +-- The statistics for modifying CTEs used to be reported as "never executed", +-- when all plan nodes were executed and some stat information was expected. +-- Test QD recieving the stats from all slices and showing it in explain output. +--start_ignore +DROP TABLE IF EXISTS with_dml; +--end_ignore +CREATE TABLE with_dml (i int, j int) DISTRIBUTED BY (i); +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + INSERT INTO with_dml SELECT i, i * 100 FROM generate_series(1,5) i + RETURNING i +) SELECT * FROM cte; +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + UPDATE with_dml SET j = j + 1 + RETURNING i +) SELECT * FROM cte; +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF) +WITH cte AS ( + DELETE FROM with_dml WHERE i > 0 + RETURNING i +) SELECT * FROM cte; +DROP TABLE with_dml;