diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala index 27cb39584fea..615cceae93b6 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala @@ -688,6 +688,22 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr } } + test("test shuffle function") { + withSQLConf( + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> (ConstantFolding.ruleName + "," + NullPropagation.ruleName)) { + runQueryAndCompare( + "select shuffle(split(n_comment, ' ')) from nation", + compareResult = false + )(checkGlutenOperatorMatch[ProjectExecTransformer]) + + runQueryAndCompare( + "select shuffle(array(1,2,3,4,5)), shuffle(array(1,3,null,3,4)), shuffle(null)", + compareResult = false, + noFallBack = false + )(checkGlutenOperatorMatch[ProjectExecTransformer]) + } + } + test("test 'function regexp_extract_all'") { runQueryAndCompare( "select l_orderkey, regexp_extract_all(l_comment, '([a-z])', 1) " + diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h index b9a72cf0548a..019d49b02a1c 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h @@ -181,6 +181,7 @@ static const std::map SCALAR_FUNCTIONS // array functions {"array", "array"}, + {"shuffle", "arrayShuffle"}, {"range", "range"}, /// dummy mapping // map functions diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index 618798b15c00..e34ea88403d1 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -229,6 +229,7 @@ object ExpressionMappings { Sig[ArrayRepeat](ARRAY_REPEAT), Sig[ArrayRemove](ARRAY_REMOVE), Sig[ArrayFilter](FILTER), + Sig[Shuffle](SHUFFLE), // Map functions Sig[CreateMap](CREATE_MAP), Sig[GetMapValue](GET_MAP_VALUE), diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9b987b0e4b3b..a51157e6263a 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -167,6 +167,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -686,6 +689,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-36755: ArraysOverlap hould handle duplicated Double.NaN and Float.Nan") .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 189a09d350ff..7e0ccb17c918 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -195,6 +195,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -273,6 +274,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameTungstenSuite] enableSuite[GlutenDataFrameSetOperationsSuite] // Result depends on the implementation for nondeterministic expression rand. diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 05e00ca5d47e..144d103c0d66 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -185,6 +185,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -727,6 +730,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 518908c9c5bf..00eb455c0f9c 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -114,6 +114,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -938,6 +939,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index d907223015ec..679893bb65e4 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -187,6 +187,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -567,6 +570,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 9d297e4ea067..4f8afe579b88 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -95,6 +95,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -943,6 +944,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index d907223015ec..679893bb65e4 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -187,6 +187,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -567,6 +570,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 1c74bd2476c4..6f6c6d05a80e 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -96,6 +96,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -959,6 +960,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index ca8b098aa32d..f097d53625a2 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -245,6 +245,7 @@ object ExpressionNames { final val ARRAY_REPEAT = "array_repeat" final val ARRAY_REMOVE = "array_remove" final val FILTER = "filter" + final val SHUFFLE = "shuffle" // Map functions final val CREATE_MAP = "map"