From afd096737fe95b76d69783b682955a2d833436ed Mon Sep 17 00:00:00 2001 From: emptyOVO Date: Sun, 22 Sep 2024 15:22:09 +0800 Subject: [PATCH] [INLONG-11168][SDK] Transform support REGEXP_SPLIT_TO_ARRAY() function --- .../function/RegexpSplitToArrayFunction.java | 101 ++++++++++++++++++ .../TestRegexpSplitToArrayFunction.java | 90 ++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpSplitToArrayFunction.java create mode 100644 inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpSplitToArrayFunction.java diff --git a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpSplitToArrayFunction.java b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpSplitToArrayFunction.java new file mode 100644 index 00000000000..81df7b561f6 --- /dev/null +++ b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpSplitToArrayFunction.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.inlong.sdk.transform.process.function; + +import org.apache.inlong.sdk.transform.decode.SourceData; +import org.apache.inlong.sdk.transform.process.Context; +import org.apache.inlong.sdk.transform.process.operator.OperatorTools; +import org.apache.inlong.sdk.transform.process.parser.ValueParser; + +import net.sf.jsqlparser.expression.Expression; +import net.sf.jsqlparser.expression.Function; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +/** + * RegexpSplitToArrayFunction + * description: REGEXP_SPLIT_TO_ARRAY( string, pattern text|, flags text l) → text[] + * Splits string using a POslX regular expression as the delimiter, producing an array of results + * parameters: 1) source_string: the string to be matched + * 2) pattern: POSIX regular expression for matching + * 3) flags: one or more characters that control the behavior of a function, + * 'g' flag can be used when we want to match all the substrings that occur, + * 'i' flag to ignore case for matching, + * 'x' flag to extend syntax (ignoring whitespace and comments in regular expressions), + * 'm' and 'n' flag allows regular expressions to match across multiple lines + * for example: regexp_split_to_array("hello world","\s+")--return {hello, world} + */ +@TransformFunction(names = {"regexp_split_to_array"}) +public class RegexpSplitToArrayFunction implements ValueParser { + + private ValueParser inputParser; + + private ValueParser patternParser; + + private ValueParser flagParser; + + public RegexpSplitToArrayFunction(Function expr) { + if (expr.getParameters() != null) { + List expressions = expr.getParameters().getExpressions(); + if (expressions != null) { + inputParser = OperatorTools.buildParser(expressions.get(0)); + patternParser = OperatorTools.buildParser(expressions.get(1)); + if (expressions.size() == 3) { + flagParser = OperatorTools.buildParser(expressions.get(2)); + } + } + } + } + + @Override + public Object parse(SourceData sourceData, int rowIndex, Context context) { + if (inputParser == null || patternParser == null) { + return null; + } + String inputString = OperatorTools.parseString(inputParser.parse(sourceData, rowIndex, context)); + String patternString = OperatorTools.parseString(patternParser.parse(sourceData, rowIndex, context)); + String flagString = ""; + if (flagParser != null) { + flagString = OperatorTools.parseString(flagParser.parse(sourceData, rowIndex, context)); + } + return regexpSplitToArray(inputString, patternString, flagString); + } + + private List regexpSplitToArray(String inputString, String patternString, String flagString) { + int regexFlags = 0; + + if (flagString != null) { + if (flagString.contains("i")) { + regexFlags |= Pattern.CASE_INSENSITIVE; + } + if (flagString.contains("m") || flagString.contains("n")) { + regexFlags |= Pattern.MULTILINE; + } + if (flagString.contains("s")) { + regexFlags |= Pattern.DOTALL; + } + if (flagString.contains("x")) { + regexFlags |= Pattern.COMMENTS; + } + } + Pattern pattern = Pattern.compile(patternString, regexFlags); + return Arrays.asList(pattern.split(inputString)); + } +} diff --git a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpSplitToArrayFunction.java b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpSplitToArrayFunction.java new file mode 100644 index 00000000000..082e726bd29 --- /dev/null +++ b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpSplitToArrayFunction.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.inlong.sdk.transform.process.function.string; + +import org.apache.inlong.sdk.transform.decode.SourceDecoderFactory; +import org.apache.inlong.sdk.transform.encode.SinkEncoderFactory; +import org.apache.inlong.sdk.transform.pojo.TransformConfig; +import org.apache.inlong.sdk.transform.process.TransformProcessor; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.List; + +public class TestRegexpSplitToArrayFunction extends AbstractFunctionStringTestBase { + + @Test + public void testRegexpSplitToArrayFunction() throws Exception { + String transformSql1 = "select regexp_split_to_array(string1,string2) from source"; + TransformConfig config1 = new TransformConfig(transformSql1); + TransformProcessor processor1 = TransformProcessor + .create(config1, SourceDecoderFactory.createCsvDecoder(csvSource), + SinkEncoderFactory.createKvEncoder(kvSink)); + // case1: regexp_split_to_array("hello,world", "\s+") + List output1 = processor1.transform("hello,world|\\s+|5|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output1.size()); + Assert.assertEquals(output1.get(0), "result=[hello,world]"); + String transformSql2 = "select regexp_split_to_array(string1,string2) from source"; + TransformConfig config2 = new TransformConfig(transformSql2); + TransformProcessor processor2 = TransformProcessor + .create(config2, SourceDecoderFactory.createCsvDecoder(csvSource), + SinkEncoderFactory.createKvEncoder(kvSink)); + // case2: regexp_split_to_array("User: Alice, ID: 12345", ":") + List output2 = + processor2.transform("User: Alice, ID: 12345|:|5|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output2.size()); + Assert.assertEquals(output2.get(0), "result=[User, Alice, ID, 12345]"); + + String transformSql3 = "select regexp_split_to_array(string1,string2,string3) from source"; + TransformConfig config3 = new TransformConfig(transformSql3); + TransformProcessor processor3 = TransformProcessor + .create(config3, SourceDecoderFactory.createCsvDecoder(csvSource), + SinkEncoderFactory.createKvEncoder(kvSink)); + // case3: regexp_split_to_array("foo 123 bar 456", "\\d+", "g") + List output3 = processor3.transform("foo 123 bar 456|\\\\d+|g|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output3.size()); + Assert.assertEquals(output3.get(0), "result=[foo , bar ]"); + + // case4: regexp_split_to_array("foo 123 bAr 456", "bar", "i") + List output4 = processor3.transform("foo 123 bAr 456|bar|i|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output4.size()); + Assert.assertEquals(output4.get(0), "result=[foo 123 , 456]"); + + // case5: regexp_split_to_array("Hello! hello World", "hello", "ig") + List output5 = processor3.transform("Hello! hello World|hello|ig|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output5.size()); + Assert.assertEquals(output5.get(0), "result=[, ! , World]"); + + // case6: regexp_split_to_array("First line\nSecond line", "^Second", "m") + List output6 = processor3.transform("First line\\\nSecond line|^Second|m|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output6.size()); + Assert.assertEquals(output6.get(0), "result=[First line\n, line]"); + + // case7: regexp_split_to_array("Hello! hello World", "hello", "igx") + List output7 = processor3.transform("Hello! hello World|hello|igx|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output7.size()); + Assert.assertEquals(output7.get(0), "result=[, ! , World]"); + + // case6: regexp_split_to_array("First line\nSecond line", "^Second", "n") + List output8 = processor3.transform("First line\\\nSecond line|^Second|n|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output8.size()); + Assert.assertEquals(output8.get(0), "result=[First line\n, line]"); + } +}