From 64c472df2364ed71256a5a304b755c4a5894efc7 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Tue, 19 Mar 2024 16:12:04 +0800 Subject: [PATCH 1/2] Initial commit --- .../clickhouse/CHSparkPlanExecApi.scala | 7 +++++ .../expression/CHExpressionTransformer.scala | 25 ++++++++++++++++++ .../execution/VeloxStringFunctionsSuite.scala | 5 +--- .../backendsapi/SparkPlanExecApi.scala | 7 +++++ .../expression/ExpressionConverter.scala | 14 +++++----- .../StringExpressionTransformer.scala | 26 ------------------- 6 files changed, 48 insertions(+), 36 deletions(-) diff --git a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHSparkPlanExecApi.scala index 9bd1f938ab07..cea88266ea44 100644 --- a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHSparkPlanExecApi.scala +++ b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHSparkPlanExecApi.scala @@ -581,6 +581,13 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { CHPosExplodeTransformer(substraitExprName, child, original, attributeSeq) } + override def genRegexpReplaceTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + expr: RegExpReplace): ExpressionTransformer = { + CHRegExpReplaceTransformer(substraitExprName, children, expr) + } + override def createColumnarWriteFilesExec( child: SparkPlan, fileFormat: FileFormat, diff --git a/backends-clickhouse/src/main/scala/io/glutenproject/expression/CHExpressionTransformer.scala b/backends-clickhouse/src/main/scala/io/glutenproject/expression/CHExpressionTransformer.scala index bb8b704d9069..a52c67265971 100644 --- a/backends-clickhouse/src/main/scala/io/glutenproject/expression/CHExpressionTransformer.scala +++ b/backends-clickhouse/src/main/scala/io/glutenproject/expression/CHExpressionTransformer.scala @@ -197,3 +197,28 @@ case class CHPosExplodeTransformer( } } } + 
+case class CHRegExpReplaceTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + original: RegExpReplace) + extends ExpressionTransformer { + + override def doTransform(args: java.lang.Object): ExpressionNode = { + // CH's replaceRegexpAll(subject, regexp, rep) is called without a position, so it is only + // equivalent to Spark's regexp_replace(subject, regexp, rep, pos) when pos is the literal 1; + // any other or non-literal pos is rejected so that it is never silently ignored. + children(3).doTransform(args) match { + case pos: IntLiteralNode if pos.getValue == 1 => // supported, continue below + case _ => + throw new io.glutenproject.exception.GlutenNotSupportException( + s"$original not supported yet.") + } + + GenericExpressionTransformer( + substraitExprName, + Seq(children(0), children(1), children(2)), + original) + .doTransform(args) + } +} diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala index c306d70ac519..ed1f851cae6c 100644 --- a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala +++ b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala @@ -466,12 +466,9 @@ class VeloxStringFunctionsSuite extends VeloxWholeStageTransformerSuite { runQueryAndCompare( s"select l_orderkey, regexp_replace(l_comment, '([a-z])', '1', 1) " + s"from $LINEITEM_TABLE limit 5")(checkOperatorMatch[ProjectExecTransformer]) - // todo incorrect results runQueryAndCompare( s"select l_orderkey, regexp_replace(l_comment, '([a-z])', '1', 10) " + - s"from $LINEITEM_TABLE limit 5", - true, - false)(_ => {}) + s"from $LINEITEM_TABLE limit 5")(checkOperatorMatch[ProjectExecTransformer]) } test("regex invalid") { diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala index ba717ec00656..1fa18634d66e 100644 --- 
a/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/SparkPlanExecApi.scala @@ -225,6 +225,13 @@ trait SparkPlanExecApi { throw new GlutenNotSupportException("make_timestamp is not supported") } + def genRegexpReplaceTransformer( + substraitExprName: String, + children: Seq[ExpressionTransformer], + expr: RegExpReplace): ExpressionTransformer = { + GenericExpressionTransformer(substraitExprName, children, expr) + } + /** * Generate ShuffleDependency for ColumnarShuffleExchangeExec. * diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala index dd9fbed1c690..8e92f02a4268 100644 --- a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionConverter.scala @@ -372,12 +372,14 @@ object ExpressionConverter extends SQLConfHelper with Logging { s ) case r: RegExpReplace => - RegExpReplaceTransformer( - substraitExprName, - replaceWithExpressionTransformerInternal(r.subject, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(r.regexp, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(r.rep, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(r.pos, attributeSeq, expressionsMap), + BackendsApiManager.getSparkPlanExecApiInstance.genRegexpReplaceTransformer( + substraitExprName, + Seq( + replaceWithExpressionTransformerInternal(r.subject, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(r.regexp, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(r.rep, attributeSeq, expressionsMap), + replaceWithExpressionTransformerInternal(r.pos, attributeSeq, expressionsMap) + ), r ) case equal: EqualNullSafe => diff --git 
a/gluten-core/src/main/scala/io/glutenproject/expression/StringExpressionTransformer.scala b/gluten-core/src/main/scala/io/glutenproject/expression/StringExpressionTransformer.scala index b1ac0ddae8c7..3fe42e8d7039 100644 --- a/gluten-core/src/main/scala/io/glutenproject/expression/StringExpressionTransformer.scala +++ b/gluten-core/src/main/scala/io/glutenproject/expression/StringExpressionTransformer.scala @@ -16,7 +16,6 @@ */ package io.glutenproject.expression -import io.glutenproject.exception.GlutenNotSupportException import io.glutenproject.expression.ConverterUtils.FunctionConfig import io.glutenproject.substrait.expression._ @@ -48,28 +47,3 @@ case class String2TrimExpressionTransformer( ExpressionBuilder.makeScalarFunction(functionId, expressNodes, typeNode) } } - -case class RegExpReplaceTransformer( - substraitExprName: String, - subject: ExpressionTransformer, - regexp: ExpressionTransformer, - rep: ExpressionTransformer, - pos: ExpressionTransformer, - original: RegExpReplace) - extends ExpressionTransformer { - - override def doTransform(args: java.lang.Object): ExpressionNode = { - // In CH: replaceRegexpAll(subject, regexp, rep), which is equivalent - // In Spark: regexp_replace(subject, regexp, rep, pos=1) - val posNode = pos.doTransform(args) - if ( - !posNode.isInstanceOf[IntLiteralNode] || - posNode.asInstanceOf[IntLiteralNode].getValue != 1 - ) { - throw new GlutenNotSupportException(s"$original not supported yet.") - } - - GenericExpressionTransformer(substraitExprName, Seq(subject, regexp, rep), original) - .doTransform(args) - } -} From 525eb54777a530c59276c7f30ca856ed57eaf493 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Wed, 20 Mar 2024 09:50:56 +0800 Subject: [PATCH 2/2] Add a test --- .../execution/VeloxFunctionsValidateSuite.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxFunctionsValidateSuite.scala 
b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxFunctionsValidateSuite.scala index 5c6def05e88e..4cffa121dae3 100644 --- a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxFunctionsValidateSuite.scala @@ -512,4 +512,16 @@ class VeloxFunctionsValidateSuite extends VeloxWholeStageTransformerSuite { } } } + + test("regexp_replace") { + runQueryAndCompare( + "SELECT regexp_replace(l_partkey, '\\\\w', 'something') FROM lineitem limit 100") { + checkOperatorMatch[ProjectExecTransformer] + } + runQueryAndCompare( + "SELECT regexp_replace(l_partkey, '\\\\w', 'something', 3) FROM lineitem limit 100") { + checkOperatorMatch[ProjectExecTransformer] + } + } + }