Skip to content

Commit

Permalink
[INLONG-11060][SDK] Transform support REGEXP_...() related functions
Browse files Browse the repository at this point in the history
  • Loading branch information
emptyOVO committed Sep 13, 2024
1 parent c620fdc commit 9d65299
Show file tree
Hide file tree
Showing 14 changed files with 1,075 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * RegexpCountFunction
 * description: REGEXP_COUNT(str, regexp)--Returns the number of times str matches the regexp pattern.
 *              regexp must be a Java regular expression.
 *              Returns an INTEGER representation of the number of matches.
 *              NULL if any of the arguments are NULL or regexp is invalid.
 */
@TransformFunction(names = {"regexp_count"})
public class RegexpCountFunction implements ValueParser {

    private ValueParser inputStringParser;

    private ValueParser patternStringParser;

    /**
     * Builds the value parsers for the first two call arguments (str, regexp).
     * Additional arguments are ignored; with fewer than two arguments both parsers
     * stay unset and {@link #parse} returns null.
     *
     * @param expr the function call expression whose parameters supply the arguments
     */
    public RegexpCountFunction(Function expr) {
        if (expr.getParameters() != null) {
            List<Expression> expressions = expr.getParameters().getExpressions();
            if (expressions != null && expressions.size() >= 2) {
                inputStringParser = OperatorTools.buildParser(expressions.get(0));
                patternStringParser = OperatorTools.buildParser(expressions.get(1));
            }
        }
    }

    /**
     * Counts the non-overlapping matches of the regexp argument within the str argument.
     *
     * @return the number of matches as an Integer, or null when the function was built
     *         with too few arguments, when either argument evaluates to null, or when
     *         the regexp is not a valid Java regular expression
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputStringParser == null || patternStringParser == null) {
            return null;
        }
        Object inputObject = inputStringParser.parse(sourceData, rowIndex, context);
        Object patternObject = patternStringParser.parse(sourceData, rowIndex, context);
        // Documented contract: a NULL argument yields NULL instead of a NullPointerException.
        if (inputObject == null || patternObject == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputObject);
        String patternString = OperatorTools.parseString(patternObject);
        if (inputString == null || patternString == null) {
            return null;
        }
        Pattern pattern;
        try {
            pattern = Pattern.compile(patternString);
        } catch (java.util.regex.PatternSyntaxException ignored) {
            // Documented contract: an invalid regexp yields NULL rather than an exception.
            return null;
        }
        Matcher matcher = pattern.matcher(inputString);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * RegexpExtractAllFunction
 * description: REGEXP_EXTRACT_ALL(str, regexp[, extractIndex])--Returns an ARRAY representation of all the matched
 *              substrings. NULL if any of the arguments are NULL or invalid. Extracts all the substrings in str that
 *              match the regexp expression and correspond to the regexp group extractIndex. regexp may contain
 *              multiple groups. extractIndex indicates which regexp group to extract; group numbering starts from 1
 *              and 0 means matching the entire regular expression. When extractIndex is not specified this
 *              implementation uses 0. NULL is also returned when extractIndex is negative, exceeds the number of
 *              groups in regexp, or when nothing matches.
 * for example: REGEXP_EXTRACT_ALL("abc123def456ghi789", "(\\d+)", 0)--return [123, 456, 789]
 *              REGEXP_EXTRACT_ALL("Name: John, Age: 25, Location: NY", "Name: (\\w+), Age: (\\d+), Location: (\\w+)", 1)--return [John]
 *              REGEXP_EXTRACT_ALL("Name: John, Age: 25, Location: NY", "Name: (\\w+), Age: (\\d+), Location: (\\w+)", 0)--return [Name: John, Age: 25, Location: NY]
 */
@TransformFunction(names = {"regexp_extract_all"})
public class RegexpExtractAllFunction implements ValueParser {

    private ValueParser inputStringParser;

    private ValueParser patternStringParser;

    private ValueParser indexIntegerParser;

    /**
     * Builds the value parsers for str, regexp and, when present, extractIndex.
     * With fewer than two arguments no parser is set and {@link #parse} returns null.
     *
     * @param expr the function call expression whose parameters supply the arguments
     */
    public RegexpExtractAllFunction(Function expr) {
        if (expr.getParameters() != null) {
            List<Expression> expressions = expr.getParameters().getExpressions();
            if (expressions != null && expressions.size() >= 2) {
                inputStringParser = OperatorTools.buildParser(expressions.get(0));
                patternStringParser = OperatorTools.buildParser(expressions.get(1));
                if (expressions.size() >= 3) {
                    indexIntegerParser = OperatorTools.buildParser(expressions.get(2));
                }
            }
        }
    }

    /**
     * Collects the requested group of every match of regexp within str.
     *
     * @return a List of the extracted substrings (an entry is null when the requested group
     *         did not participate in that match), or null when an argument is missing or
     *         null, the regexp is invalid, the index is negative or larger than the number
     *         of groups, or there is no match
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputStringParser == null || patternStringParser == null) {
            return null;
        }
        Object inputObject = inputStringParser.parse(sourceData, rowIndex, context);
        Object patternObject = patternStringParser.parse(sourceData, rowIndex, context);
        // Documented contract: a NULL argument yields NULL instead of a NullPointerException.
        if (inputObject == null || patternObject == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputObject);
        String patternString = OperatorTools.parseString(patternObject);
        if (inputString == null || patternString == null) {
            return null;
        }
        // NOTE(review): the original description stated the default extractIndex is 1, but the
        // code has always defaulted to 0 (whole match); kept at 0 for compatibility - confirm.
        int index = 0;
        if (indexIntegerParser != null) {
            Object indexObject = indexIntegerParser.parse(sourceData, rowIndex, context);
            if (indexObject == null) {
                return null;
            }
            index = OperatorTools.parseBigDecimal(indexObject).intValue();
        }
        if (index < 0) {
            return null;
        }
        Pattern pattern;
        try {
            pattern = Pattern.compile(patternString);
        } catch (java.util.regex.PatternSyntaxException ignored) {
            // Documented contract: an invalid regexp yields NULL rather than an exception.
            return null;
        }
        Matcher matcher = pattern.matcher(inputString);
        // The group count is a property of the pattern, so validate the index once up front
        // instead of on every match.
        if (index > matcher.groupCount()) {
            return null;
        }
        List<String> resultList = new ArrayList<>();
        while (matcher.find()) {
            resultList.add(matcher.group(index));
        }
        return resultList.isEmpty() ? null : resultList;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * RegexpExtractFunction
 * description: REGEXP_EXTRACT(string1, string2[, integer])--Returns a string from string1 which extracted with a
 *              specified regular expression string2 and a regexp match group index integer. The regexp match group
 *              index starts from 1 and 0 means matching the whole regexp. In addition, the regexp match group index
 *              should not exceed the number of the defined groups. When the index is omitted, 0 (the whole match)
 *              is used. Returns NULL if any of the arguments is NULL, the regexp is invalid, the index is negative
 *              or exceeds the number of groups, or nothing matches.
 * for example: REGEXP_EXTRACT("abc123def", "(\\d+)", 1)--return 123
 *              REGEXP_EXTRACT("Name: John, Age: 25, Location: NY", "Name: (\\w+), Age: (\\d+), Location: (\\w+)", 2)--return 25
 *              REGEXP_EXTRACT("abc123def", "(\\d+)", 2)--return null
 *              REGEXP_EXTRACT("abc123def", "abcdef", 1)--return null
 */
@TransformFunction(names = {"regexp_extract"})
public class RegexpExtractFunction implements ValueParser {

    private ValueParser inputStringParser;

    private ValueParser patternStringParser;

    private ValueParser indexIntegerParser;

    /**
     * Builds the value parsers for string1, string2 and, when present, the group index.
     * The index is optional, as advertised by the function signature; with fewer than two
     * arguments no parser is set and {@link #parse} returns null.
     *
     * @param expr the function call expression whose parameters supply the arguments
     */
    public RegexpExtractFunction(Function expr) {
        if (expr.getParameters() != null) {
            List<Expression> expressions = expr.getParameters().getExpressions();
            if (expressions != null && expressions.size() >= 2) {
                inputStringParser = OperatorTools.buildParser(expressions.get(0));
                patternStringParser = OperatorTools.buildParser(expressions.get(1));
                if (expressions.size() >= 3) {
                    indexIntegerParser = OperatorTools.buildParser(expressions.get(2));
                }
            }
        }
    }

    /**
     * Extracts the requested group of the first match of string2 within string1.
     *
     * @return the extracted substring (null when the requested group did not participate in
     *         the match), or null when an argument is missing or null, the regexp is
     *         invalid, the index is negative or larger than the number of groups, or there
     *         is no match
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputStringParser == null || patternStringParser == null) {
            return null;
        }
        Object inputObject = inputStringParser.parse(sourceData, rowIndex, context);
        Object patternObject = patternStringParser.parse(sourceData, rowIndex, context);
        // A NULL argument yields NULL instead of a NullPointerException.
        if (inputObject == null || patternObject == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputObject);
        String patternString = OperatorTools.parseString(patternObject);
        if (inputString == null || patternString == null) {
            return null;
        }
        // Omitted index means "whole match", the same default RegexpExtractAllFunction uses.
        int indexInteger = 0;
        if (indexIntegerParser != null) {
            Object indexObject = indexIntegerParser.parse(sourceData, rowIndex, context);
            if (indexObject == null) {
                return null;
            }
            indexInteger = OperatorTools.parseBigDecimal(indexObject).intValue();
        }
        if (indexInteger < 0) {
            return null;
        }
        Pattern pattern;
        try {
            pattern = Pattern.compile(patternString);
        } catch (java.util.regex.PatternSyntaxException ignored) {
            // An invalid regexp yields NULL rather than aborting the transform.
            return null;
        }
        Matcher matcher = pattern.matcher(inputString);
        if (indexInteger <= matcher.groupCount() && matcher.find()) {
            return matcher.group(indexInteger);
        }
        return null;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * RegexpFunction
 * description: REGEX(string1, string2)--Returns TRUE if any (possibly empty) substring of string1 matches the Java
 *              regular expression string2, otherwise FALSE. Returns NULL if any of arguments is NULL.
 */
@TransformFunction(names = {"regex"})
public class RegexpFunction implements ValueParser {

    private ValueParser inputParser;

    private ValueParser patternParser;

    /**
     * Builds the value parsers for string1 and string2. The parsers are only set when the
     * call has exactly two arguments; otherwise {@link #parse} returns null.
     *
     * @param expr the function call expression whose parameters supply the arguments
     */
    public RegexpFunction(Function expr) {
        if (expr.getParameters() != null) {
            List<Expression> expressions = expr.getParameters().getExpressions();
            if (expressions != null && expressions.size() == 2) {
                inputParser = OperatorTools.buildParser(expressions.get(0));
                patternParser = OperatorTools.buildParser(expressions.get(1));
            }
        }
    }

    /**
     * Tests whether any substring of string1 matches the regular expression string2.
     *
     * @return Boolean TRUE/FALSE for match/no match, or null when the function was not built
     *         with exactly two arguments or when either argument evaluates to null
     * @throws java.util.regex.PatternSyntaxException if string2 is not a valid Java regular
     *         expression (unchanged from the previous behavior)
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputParser == null || patternParser == null) {
            return null;
        }
        Object inputObject = inputParser.parse(sourceData, rowIndex, context);
        Object patternObject = patternParser.parse(sourceData, rowIndex, context);
        // Documented contract: a NULL argument yields NULL instead of a NullPointerException.
        if (inputObject == null || patternObject == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputObject);
        String patternString = OperatorTools.parseString(patternObject);
        if (inputString == null || patternString == null) {
            return null;
        }
        Pattern pattern = Pattern.compile(patternString);
        Matcher matcher = pattern.matcher(inputString);
        return matcher.find();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * RegexpInstrFunction
 * description: REGEXP_INSTR(str, regexp)--Returns the position of the first substring in str that matches regexp.
 *              Result indexes begin at 1, 0 if there is no match.
 *              Returns an INTEGER representation of the first matched substring index.
 *              NULL if any of the arguments are NULL or regexp is invalid.
 */
@TransformFunction(names = {"regexp_instr"})
public class RegexpInstrFunction implements ValueParser {

    private ValueParser inputStringParser;

    private ValueParser patternStringParser;

    /**
     * Builds the value parsers for the first two call arguments (str, regexp).
     * Additional arguments are ignored; with fewer than two arguments both parsers
     * stay unset and {@link #parse} returns null.
     *
     * @param expr the function call expression whose parameters supply the arguments
     */
    public RegexpInstrFunction(Function expr) {
        if (expr.getParameters() != null) {
            List<Expression> expressions = expr.getParameters().getExpressions();
            if (expressions != null && expressions.size() >= 2) {
                inputStringParser = OperatorTools.buildParser(expressions.get(0));
                patternStringParser = OperatorTools.buildParser(expressions.get(1));
            }
        }
    }

    /**
     * Locates the first match of the regexp argument within the str argument.
     *
     * @return the 1-based start position of the first match as an Integer, 0 when there is
     *         no match, or null when the function was built with too few arguments, when
     *         either argument evaluates to null, or when the regexp is not a valid Java
     *         regular expression
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        if (inputStringParser == null || patternStringParser == null) {
            return null;
        }
        Object inputObject = inputStringParser.parse(sourceData, rowIndex, context);
        Object patternObject = patternStringParser.parse(sourceData, rowIndex, context);
        // Documented contract: a NULL argument yields NULL instead of a NullPointerException.
        if (inputObject == null || patternObject == null) {
            return null;
        }
        String inputString = OperatorTools.parseString(inputObject);
        String patternString = OperatorTools.parseString(patternObject);
        if (inputString == null || patternString == null) {
            return null;
        }
        Pattern pattern;
        try {
            pattern = Pattern.compile(patternString);
        } catch (java.util.regex.PatternSyntaxException ignored) {
            // Documented contract: an invalid regexp yields NULL rather than an exception.
            return null;
        }
        Matcher matcher = pattern.matcher(inputString);
        // Matcher.start() is 0-based; the SQL function reports 1-based positions.
        return matcher.find() ? matcher.start() + 1 : 0;
    }
}
Loading

0 comments on commit 9d65299

Please sign in to comment.