diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 2eda4e42a76..9cc3eb5c3e5 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -2049,6 +2049,16 @@ CreateSubqueryTargetListAndAdjustVars(List *columnList) */ column->varno = 1; column->varattno = resNo; + + /* + * 1 subquery means there is one range table entry so with Postgres 16+ we need + * to ensure that column's varnullingrels - the set of join rels that can null + * the var - is empty. Otherwise, when given the query, the Postgres planner + * may attempt to access a non-existent range table and segfault, as in #7787. + */ +#if PG_VERSION_NUM >= PG_VERSION_16 + column->varnullingrels = NULL; +#endif } return subqueryTargetEntryList; diff --git a/src/test/regress/expected/multi_outer_join_columns.out b/src/test/regress/expected/multi_outer_join_columns.out new file mode 100644 index 00000000000..79527f7c65f --- /dev/null +++ b/src/test/regress/expected/multi_outer_join_columns.out @@ -0,0 +1,426 @@ +--- Test for verifying that column references (var nodes) in targets that cannot be pushed down +--- do not cause issues for the postgres planner, in particular postgres versions 16+, where the +--- varnullingrels field of a VAR node may contain relids of join relations that can make the var +--- NULL; in a rewritten distributed query without a join such relids do not have a meaning. +-- This test has an alternative goldfile because of the following feature in Postgres 16: +-- https://github.com/postgres/postgres/commit/1349d2790bf48a4de072931c722f39337e72055e +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 16 AS server_version_ge_16; + server_version_ge_16 +--------------------------------------------------------------------- + t +(1 row) + +CREATE SCHEMA outer_join_columns_testing; +SET search_path to 'outer_join_columns_testing'; +SET citus.next_shard_id TO 30070000; +SET citus.shard_replication_factor TO 1; +SET citus.enable_local_execution TO ON; +CREATE TABLE t1 (id INT PRIMARY KEY); +INSERT INTO t1 VALUES (1), (2); +CREATE TABLE t2 (id INT, account_id INT, a2 INT, PRIMARY KEY(id, account_id)); +INSERT INTO t2 VALUES (3, 1, 10), (4, 2, 20), (5, 1, NULL); +SELECT create_distributed_table('t1', 'id'); +NOTICE: Copying data from local table... +NOTICE: copying the data has completed +DETAIL: The local data in the table is no longer visible, but is still on disk. +HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t1$$) + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('t2', 'account_id'); +NOTICE: Copying data from local table... +NOTICE: copying the data has completed +DETAIL: The local data in the table is no longer visible, but is still on disk. +HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t2$$) + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Test the issue seen in #7705; a target expression with +-- a window function that cannot be pushed down because the +-- partion by is not on the distribution column also includes +-- a column from the inner side of a left outer join, which +-- produces a non-empty varnullingrels set in PG 16 (and higher) +SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + id | max +--------------------------------------------------------------------- + 1 | 10 + 2 | 20 + 1 | +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + QUERY PLAN +--------------------------------------------------------------------- + WindowAgg + Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3 + -> Sort + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Node: host=localhost port=xxxxx dbname=regression + -> Hash Right Join + Output: t1.id, t2.a2, t2.id + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id; + id | max +--------------------------------------------------------------------- + 1 | 10 + 2 | 20 + 1 | +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id; + QUERY PLAN +--------------------------------------------------------------------- + WindowAgg + Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3 + -> Sort + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Node: host=localhost port=xxxxx dbname=regression + -> Hash Right Join + Output: t1.id, t2.a2, t2.id + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +SELECT DISTINCT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + id | max +--------------------------------------------------------------------- + 1 | + 1 | 10 + 2 | 20 +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT DISTINCT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + QUERY PLAN +--------------------------------------------------------------------- + HashAggregate + Output: remote_scan.id, (max(remote_scan.max) OVER (?)), remote_scan.worker_column_3 + Group Key: remote_scan.id, max(remote_scan.max) OVER (?) + -> WindowAgg + Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3 + -> Sort + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Node: host=localhost port=xxxxx dbname=regression + -> Hash Right Join + Output: t1.id, t2.a2, t2.id + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(25 rows) + +CREATE SEQUENCE test_seq START 101; +CREATE OR REPLACE FUNCTION TEST_F(int) returns INT language sql stable as $$ select $1 + 42; $$ ; +-- Issue #7705 also occurs if a target expression includes a column +-- of a distributed table that is on the inner side of a left outer +-- join and a call to nextval(), because nextval() cannot be pushed +-- down, and must be run on the coordinator +SELECT t1.id, TEST_F(t2.a2 + nextval('test_seq') :: int) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +ORDER BY t1.id; + id | test_f +--------------------------------------------------------------------- + 1 | 153 + 1 | + 2 | 165 +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT t1.id, TEST_F(t2.a2 + nextval('test_seq') :: int) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +ORDER BY t1.id; + QUERY PLAN +--------------------------------------------------------------------- + Result + Output: remote_scan.id, ((remote_scan.test_f + (nextval('test_seq'::regclass))::integer) + 42) + -> Sort + Output: remote_scan.id, remote_scan.test_f + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Output: remote_scan.id, remote_scan.test_f + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS test_f FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Node: host=localhost port=xxxxx dbname=regression + -> Hash Right Join + Output: t1.id, t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +SELECT t1.id, CASE nextval('test_seq') % 2 = 0 WHEN true THEN t2.a2 ELSE 1 END +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +ORDER BY t1.id; + id | case +--------------------------------------------------------------------- + 1 | 10 + 1 | 1 + 2 | 20 +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT t1.id, CASE nextval('test_seq') %2 = 0 WHEN true THEN t2.a2 ELSE 1 END +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +ORDER BY t1.id; + QUERY PLAN +--------------------------------------------------------------------- + Result + Output: remote_scan.id, CASE ((nextval('test_seq'::regclass) % '2'::bigint) = 0) WHEN CASE_TEST_EXPR THEN remote_scan."case" ELSE 1 END + -> Sort + Output: remote_scan.id, remote_scan."case" + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Output: remote_scan.id, remote_scan."case" + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS "case" FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Node: host=localhost port=xxxxx dbname=regression + -> Hash Right Join + Output: t1.id, t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +-- Issue #7787: count distinct of a column from the inner side of a +-- left outer join will have a non-empty varnullingrels in the query +-- tree returned by Postgres 16+, so ensure this is not reflected in +-- the worker subquery constructed by Citus; it has just one relation, +-- for the pushed down subquery. +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + -> Sort + Output: remote_scan.count + Sort Key: remote_scan.count + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(25 rows) + +-- Issue #7787 also occurs with a HAVING clause +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: remote_scan."?column?" + Filter: (count(DISTINCT remote_scan.worker_column_2) > 1) + -> Sort + Output: remote_scan."?column?", remote_scan.worker_column_2 + Sort Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Output: remote_scan."?column?", remote_scan.worker_column_2 + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT 1, worker_column_1 AS worker_column_2 FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: 1, t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(26 rows) + +-- Check right outer join +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + -> Sort + Output: remote_scan.count + Sort Key: remote_scan.count + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t2.account_id OPERATOR(pg_catalog.=) t1.id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(25 rows) + +-- Check both count distinct and having clause +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + Filter: (count(DISTINCT remote_scan.worker_column_2) > 1) + -> Sort + Output: remote_scan.count, remote_scan.worker_column_2 + Sort Key: remote_scan.count + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count, remote_scan.worker_column_2 + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count, worker_column_2 FROM (SELECT t2.a2 AS worker_column_1, t2.id AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1, worker_column_2 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2, t2.id + Group Key: t2.a2, t2.id + -> Hash Right Join + Output: t2.a2, t2.id + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(26 rows) + +--- cleanup +\set VERBOSITY TERSE +DROP SCHEMA outer_join_columns_testing CASCADE; +NOTICE: drop cascades to 4 other objects +RESET all; diff --git a/src/test/regress/expected/issue_7705.out b/src/test/regress/expected/multi_outer_join_columns_1.out similarity index 51% rename from src/test/regress/expected/issue_7705.out rename to src/test/regress/expected/multi_outer_join_columns_1.out index 20b078226ad..10dc6773d10 100644 --- a/src/test/regress/expected/issue_7705.out +++ b/src/test/regress/expected/multi_outer_join_columns_1.out @@ -2,10 +2,18 @@ --- do not cause issues for the postgres planner, in particular postgres versions 16+, where the --- varnullingrels field of a VAR node may contain relids of join relations that can make the var --- NULL; in a rewritten distributed query without a join such relids do not have a meaning. ---- Issue #7705: [SEGFAULT] Querying distributed tables with window partition causes segmentation fault ---- https://github.com/citusdata/citus/issues/7705 -CREATE SCHEMA issue_7705; -SET search_path to 'issue_7705'; +-- This test has an alternative goldfile because of the following feature in Postgres 16: +-- https://github.com/postgres/postgres/commit/1349d2790bf48a4de072931c722f39337e72055e +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 16 AS server_version_ge_16; + server_version_ge_16 +--------------------------------------------------------------------- + f +(1 row) + +CREATE SCHEMA outer_join_columns_testing; +SET search_path to 'outer_join_columns_testing'; SET citus.next_shard_id TO 30070000; SET citus.shard_replication_factor TO 1; SET citus.enable_local_execution TO ON; @@ -17,7 +25,7 @@ SELECT create_distributed_table('t1', 'id'); NOTICE: Copying data from local table... NOTICE: copying the data has completed DETAIL: The local data in the table is no longer visible, but is still on disk. -HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$issue_7705.t1$$) +HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t1$$) create_distributed_table --------------------------------------------------------------------- @@ -27,7 +35,7 @@ SELECT create_distributed_table('t2', 'account_id'); NOTICE: Copying data from local table... NOTICE: copying the data has completed DETAIL: The local data in the table is no longer visible, but is still on disk. -HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$issue_7705.t2$$) +HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t2$$) create_distributed_table --------------------------------------------------------------------- @@ -50,7 +58,7 @@ FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- WindowAgg Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3 @@ -62,17 +70,17 @@ FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; Task Count: 4 Tasks Shown: One of 4 -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (issue_7705.t1_30070000 t1 LEFT JOIN issue_7705.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery Node: host=localhost port=xxxxx dbname=regression -> Hash Right Join Output: t1.id, t2.a2, t2.id Inner Unique: true Hash Cond: (t2.account_id = t1.id) - -> Seq Scan on issue_7705.t2_30070004 t2 + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 Output: t2.id, t2.account_id, t2.a2 -> Hash Output: t1.id - -> Seq Scan on issue_7705.t1_30070000 t1 + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 Output: t1.id (22 rows) @@ -88,7 +96,7 @@ FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id; EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) SELECT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- WindowAgg Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3 @@ -100,17 +108,17 @@ FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id; Task Count: 4 Tasks Shown: One of 4 -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (issue_7705.t2_30070004 t2 RIGHT JOIN issue_7705.t1_30070000 t1 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery Node: host=localhost port=xxxxx dbname=regression -> Hash Right Join Output: t1.id, t2.a2, t2.id Inner Unique: true Hash Cond: (t2.account_id = t1.id) - -> Seq Scan on issue_7705.t2_30070004 t2 + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 Output: t2.id, t2.account_id, t2.a2 -> Hash Output: t1.id - -> Seq Scan on issue_7705.t1_30070000 t1 + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 Output: t1.id (22 rows) @@ -126,7 +134,7 @@ FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) SELECT DISTINCT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id) FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- HashAggregate Output: remote_scan.id, (max(remote_scan.max) OVER (?)), remote_scan.worker_column_3 @@ -141,17 +149,17 @@ FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; Task Count: 4 Tasks Shown: One of 4 -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (issue_7705.t1_30070000 t1 LEFT JOIN issue_7705.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery Node: host=localhost port=xxxxx dbname=regression -> Hash Right Join Output: t1.id, t2.a2, t2.id Inner Unique: true Hash Cond: (t2.account_id = t1.id) - -> Seq Scan on issue_7705.t2_30070004 t2 + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 Output: t2.id, t2.account_id, t2.a2 -> Hash Output: t1.id - -> Seq Scan on issue_7705.t1_30070000 t1 + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 Output: t1.id (25 rows) @@ -175,7 +183,7 @@ EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) SELECT t1.id, TEST_F(t2.a2 + nextval('test_seq') :: int) FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id ORDER BY t1.id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Result Output: remote_scan.id, ((remote_scan.test_f + (nextval('test_seq'::regclass))::integer) + 42) @@ -187,17 +195,17 @@ ORDER BY t1.id; Task Count: 4 Tasks Shown: One of 4 -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS test_f FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (issue_7705.t1_30070000 t1 LEFT JOIN issue_7705.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Query: SELECT worker_column_1 AS id, worker_column_2 AS test_f FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery Node: host=localhost port=xxxxx dbname=regression -> Hash Right Join Output: t1.id, t2.a2 Inner Unique: true Hash Cond: (t2.account_id = t1.id) - -> Seq Scan on issue_7705.t2_30070004 t2 + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 Output: t2.id, t2.account_id, t2.a2 -> Hash Output: t1.id - -> Seq Scan on issue_7705.t1_30070000 t1 + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 Output: t1.id (22 rows) @@ -215,7 +223,7 @@ EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) SELECT t1.id, CASE nextval('test_seq') %2 = 0 WHEN true THEN t2.a2 ELSE 1 END FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id ORDER BY t1.id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Result Output: remote_scan.id, CASE ((nextval('test_seq'::regclass) % '2'::bigint) = 0) WHEN CASE_TEST_EXPR THEN remote_scan."case" ELSE 1 END @@ -227,22 +235,180 @@ ORDER BY t1.id; Task Count: 4 Tasks Shown: One of 4 -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS "case" FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (issue_7705.t1_30070000 t1 LEFT JOIN issue_7705.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery + Query: SELECT worker_column_1 AS id, worker_column_2 AS "case" FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery Node: host=localhost port=xxxxx dbname=regression -> Hash Right Join Output: t1.id, t2.a2 Inner Unique: true Hash Cond: (t2.account_id = t1.id) - -> Seq Scan on issue_7705.t2_30070004 t2 + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +-- Issue #7787: count distinct of a column from the inner side of a +-- left outer join will have a non-empty varnullingrels in the query +-- tree returned by Postgres 16+, so ensure this is not reflected in +-- the worker subquery constructed by Citus; it has just one relation, +-- for the pushed down subquery. +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(22 rows) + +-- Issue #7787 also occurs with a HAVING clause +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: remote_scan."?column?" + Filter: (count(DISTINCT remote_scan.worker_column_2) > 1) + -> Custom Scan (Citus Adaptive) + Output: remote_scan."?column?", remote_scan.worker_column_2 + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT 1, worker_column_1 AS worker_column_2 FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: 1, t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 Output: t2.id, t2.account_id, t2.a2 -> Hash Output: t1.id - -> Seq Scan on issue_7705.t1_30070000 t1 + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(23 rows) + +-- Check right outer join +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t2.account_id OPERATOR(pg_catalog.=) t1.id)))) worker_subquery GROUP BY worker_column_1 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2 + Group Key: t2.a2 + -> Hash Right Join + Output: t2.a2 + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 Output: t1.id (22 rows) +-- Check both count distinct and having clause +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; + count +--------------------------------------------------------------------- + 2 +(1 row) + +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; + QUERY PLAN +--------------------------------------------------------------------- + Aggregate + Output: count(DISTINCT remote_scan.count) + Filter: (count(DISTINCT remote_scan.worker_column_2) > 1) + -> Custom Scan (Citus Adaptive) + Output: remote_scan.count, remote_scan.worker_column_2 + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS count, worker_column_2 FROM (SELECT t2.a2 AS worker_column_1, t2.id AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1, worker_column_2 + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Output: t2.a2, t2.id + Group Key: t2.a2, t2.id + -> Hash Right Join + Output: t2.a2, t2.id + Inner Unique: true + Hash Cond: (t2.account_id = t1.id) + -> Seq Scan on outer_join_columns_testing.t2_30070004 t2 + Output: t2.id, t2.account_id, t2.a2 + -> Hash + Output: t1.id + -> Seq Scan on outer_join_columns_testing.t1_30070000 t1 + Output: t1.id +(23 rows) + --- cleanup \set VERBOSITY TERSE -DROP SCHEMA issue_7705 CASCADE; +DROP SCHEMA outer_join_columns_testing CASCADE; NOTICE: drop cascades to 4 other objects RESET all; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index bbb4047a950..daaa5a0cad3 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -84,7 +84,7 @@ test: forcedelegation_functions system_queries test: join_pushdown test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc statement_cancel_error_message test: multi_agg_distinct -test: multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql set_role_in_transaction +test: multi_limit_clause_approximate multi_outer_join_reference multi_outer_join_columns multi_single_relation_subquery multi_prepare_plsql set_role_in_transaction test: multi_reference_table multi_select_for_update relation_access_tracking pg13_with_ties test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery @@ -103,7 +103,7 @@ test: multi_dropped_column_aliases foreign_key_restriction_enforcement test: binary_protocol test: alter_table_set_access_method test: alter_distributed_table -test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 issue_7477 issue_7705 +test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 issue_7477 test: object_propagation_debug test: undistribute_table test: run_command_on_all_nodes diff --git a/src/test/regress/sql/issue_7705.sql b/src/test/regress/sql/multi_outer_join_columns.sql similarity index 62% rename from src/test/regress/sql/issue_7705.sql rename to src/test/regress/sql/multi_outer_join_columns.sql index 950933017ad..eec111cb547 100644 --- a/src/test/regress/sql/issue_7705.sql +++ b/src/test/regress/sql/multi_outer_join_columns.sql @@ -2,11 +2,16 @@ --- do not cause issues for the postgres planner, in particular postgres versions 16+, where the --- varnullingrels field of a VAR node may contain relids of join relations that can make the var --- NULL; in a rewritten distributed query without a join such relids do not have a meaning. ---- Issue #7705: [SEGFAULT] Querying distributed tables with window partition causes segmentation fault ---- https://github.com/citusdata/citus/issues/7705 -CREATE SCHEMA issue_7705; -SET search_path to 'issue_7705'; +-- This test has an alternative goldfile because of the following feature in Postgres 16: +-- https://github.com/postgres/postgres/commit/1349d2790bf48a4de072931c722f39337e72055e +-- + +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 16 AS server_version_ge_16; + +CREATE SCHEMA outer_join_columns_testing; +SET search_path to 'outer_join_columns_testing'; SET citus.next_shard_id TO 30070000; SET citus.shard_replication_factor TO 1; SET citus.enable_local_execution TO ON; @@ -66,7 +71,43 @@ SELECT t1.id, CASE nextval('test_seq') %2 = 0 WHEN true THEN t2.a2 ELSE 1 END FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id ORDER BY t1.id; +-- Issue #7787: count distinct of a column from the inner side of a +-- left outer join will have a non-empty varnullingrels in the query +-- tree returned by Postgres 16+, so ensure this is not reflected in +-- the worker subquery constructed by Citus; it has just one relation, +-- for the pushed down subquery. +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id; + +-- Issue #7787 also occurs with a HAVING clause +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT 1 +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT a2) > 1; + +-- Check right outer join +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id; + +-- Check both count distinct and having clause +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; +EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF) +SELECT COUNT(DISTINCT a2) +FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id +HAVING COUNT(DISTINCT t2.id) > 1; + --- cleanup \set VERBOSITY TERSE -DROP SCHEMA issue_7705 CASCADE; +DROP SCHEMA outer_join_columns_testing CASCADE; RESET all;