Skip to content

Commit

Permalink
[INLONG-11168][SDK] Transform support REGEXP_SPLIT_TO_ARRAY() function
Browse files Browse the repository at this point in the history
  • Loading branch information
emptyOVO committed Sep 22, 2024
1 parent 260a12d commit afd0967
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

/**
 * RegexpSplitToArrayFunction
 * description: REGEXP_SPLIT_TO_ARRAY(string, pattern text [, flags text ]) → text[]
 * Splits string using a regular expression as the delimiter, producing an array of results.
 * The pattern is compiled with {@link java.util.regex.Pattern}.
 * parameters: 1) source_string: the string to be split
 *             2) pattern: regular expression used as the delimiter
 *             3) flags (optional): one or more characters that control the matching behavior:
 *                'i' ignores case while matching ({@link Pattern#CASE_INSENSITIVE}),
 *                'm' and 'n' let '^' and '$' match at line boundaries ({@link Pattern#MULTILINE}),
 *                's' lets '.' match line terminators as well ({@link Pattern#DOTALL}),
 *                'x' enables extended syntax, ignoring whitespace and comments in the
 *                pattern ({@link Pattern#COMMENTS}),
 *                'g' is accepted but has no effect: the split is always performed on every match.
 * Trailing empty strings are not included in the result, as specified by
 * {@link Pattern#split(CharSequence)}.
 * for example: regexp_split_to_array("hello world","\s+")--return {hello, world}
 */
@TransformFunction(names = {"regexp_split_to_array"})
public class RegexpSplitToArrayFunction implements ValueParser {

    private ValueParser inputParser;

    private ValueParser patternParser;

    private ValueParser flagParser;

    /**
     * Builds the value parsers for the function arguments.
     * When fewer than two arguments are supplied no parser is built, and
     * {@link #parse(SourceData, int, Context)} returns null.
     *
     * @param expr the parsed SQL function call
     */
    public RegexpSplitToArrayFunction(Function expr) {
        if (expr.getParameters() == null) {
            return;
        }
        List<Expression> expressions = expr.getParameters().getExpressions();
        // Both the source string and the pattern are mandatory; bail out instead of
        // failing with an IndexOutOfBoundsException on get(0)/get(1).
        if (expressions == null || expressions.size() < 2) {
            return;
        }
        inputParser = OperatorTools.buildParser(expressions.get(0));
        patternParser = OperatorTools.buildParser(expressions.get(1));
        if (expressions.size() == 3) {
            flagParser = OperatorTools.buildParser(expressions.get(2));
        }
    }

    /**
     * Evaluates the arguments for the given row and splits the source string.
     *
     * @return the list of substrings, or null when the mandatory arguments are
     *         missing or evaluate to null
     * @throws java.util.regex.PatternSyntaxException if the pattern is not a valid regular expression
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputParser == null || patternParser == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputParser.parse(sourceData, rowIndex, context));
        String patternString = OperatorTools.parseString(patternParser.parse(sourceData, rowIndex, context));
        // Pattern.compile and Pattern.split both reject null, so treat a null argument as a null result.
        if (inputString == null || patternString == null) {
            return null;
        }
        String flagString = "";
        if (flagParser != null) {
            flagString = OperatorTools.parseString(flagParser.parse(sourceData, rowIndex, context));
        }
        return regexpSplitToArray(inputString, patternString, flagString);
    }

    /**
     * Translates the flag characters into {@link Pattern} flags and splits the input.
     *
     * @param inputString the string to split, must not be null
     * @param patternString the delimiter regular expression, must not be null
     * @param flagString the flag characters, may be null or empty for default matching
     * @return a fixed-size list backed by the array returned from {@link Pattern#split(CharSequence)}
     */
    private List<String> regexpSplitToArray(String inputString, String patternString, String flagString) {
        int regexFlags = 0;

        if (flagString != null) {
            if (flagString.contains("i")) {
                regexFlags |= Pattern.CASE_INSENSITIVE;
            }
            if (flagString.contains("m") || flagString.contains("n")) {
                regexFlags |= Pattern.MULTILINE;
            }
            if (flagString.contains("s")) {
                regexFlags |= Pattern.DOTALL;
            }
            if (flagString.contains("x")) {
                regexFlags |= Pattern.COMMENTS;
            }
            // 'g' needs no handling: Pattern.split already splits around every match.
        }
        Pattern pattern = Pattern.compile(patternString, regexFlags);
        return Arrays.asList(pattern.split(inputString));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function.string;

import org.apache.inlong.sdk.transform.decode.SourceDecoderFactory;
import org.apache.inlong.sdk.transform.encode.SinkEncoderFactory;
import org.apache.inlong.sdk.transform.pojo.TransformConfig;
import org.apache.inlong.sdk.transform.process.TransformProcessor;

import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;
import java.util.List;

/**
 * Tests for the REGEXP_SPLIT_TO_ARRAY transform function, with and without the
 * optional flags argument.
 */
public class TestRegexpSplitToArrayFunction extends AbstractFunctionStringTestBase {

    @Test
    public void testRegexpSplitToArrayFunction() throws Exception {
        // Two-argument form: source string and pattern only.
        TransformProcessor<String, String> twoArgProcessor =
                createProcessor("select regexp_split_to_array(string1,string2) from source");

        // case1: regexp_split_to_array("hello,world", "\s+")
        assertSingleResult(twoArgProcessor, "hello,world|\\s+|5|2|1|3", "result=[hello,world]");

        // case2: regexp_split_to_array("User: Alice, ID: 12345", ":")
        assertSingleResult(twoArgProcessor, "User: Alice, ID: 12345|:|5|2|1|3",
                "result=[User, Alice, ID, 12345]");

        // Three-argument form: source string, pattern and flags.
        TransformProcessor<String, String> threeArgProcessor =
                createProcessor("select regexp_split_to_array(string1,string2,string3) from source");

        // case3: regexp_split_to_array("foo 123 bar 456", "\\d+", "g")
        assertSingleResult(threeArgProcessor, "foo 123 bar 456|\\\\d+|g|2|1|3", "result=[foo , bar ]");

        // case4: regexp_split_to_array("foo 123 bAr 456", "bar", "i")
        assertSingleResult(threeArgProcessor, "foo 123 bAr 456|bar|i|2|1|3", "result=[foo 123 , 456]");

        // case5: regexp_split_to_array("Hello! hello World", "hello", "ig")
        assertSingleResult(threeArgProcessor, "Hello! hello World|hello|ig|2|1|3", "result=[, ! , World]");

        // case6: regexp_split_to_array("First line\nSecond line", "^Second", "m")
        assertSingleResult(threeArgProcessor, "First line\\\nSecond line|^Second|m|2|1|3",
                "result=[First line\n, line]");

        // case7: regexp_split_to_array("Hello! hello World", "hello", "igx")
        assertSingleResult(threeArgProcessor, "Hello! hello World|hello|igx|2|1|3", "result=[, ! , World]");

        // case8: regexp_split_to_array("First line\nSecond line", "^Second", "n")
        assertSingleResult(threeArgProcessor, "First line\\\nSecond line|^Second|n|2|1|3",
                "result=[First line\n, line]");
    }

    /**
     * Creates a processor for the given SQL that decodes with the csvSource
     * and encodes with the kvSink inherited from the test base class.
     */
    private TransformProcessor<String, String> createProcessor(String transformSql) throws Exception {
        TransformConfig config = new TransformConfig(transformSql);
        return TransformProcessor
                .create(config, SourceDecoderFactory.createCsvDecoder(csvSource),
                        SinkEncoderFactory.createKvEncoder(kvSink));
    }

    /**
     * Transforms one source record and asserts that exactly one output row with the
     * expected content is produced.
     */
    private void assertSingleResult(TransformProcessor<String, String> processor, String srcData,
            String expected) throws Exception {
        List<String> output = processor.transform(srcData, new HashMap<>());
        Assert.assertEquals(1, output.size());
        // JUnit's signature is assertEquals(expected, actual); keeping that order makes
        // failure messages report the right side as "expected".
        Assert.assertEquals(expected, output.get(0));
    }
}

0 comments on commit afd0967

Please sign in to comment.