From 00dfee300d6eb67ea20a834e96d3301fa5485a0d Mon Sep 17 00:00:00 2001 From: emptyOVO Date: Sun, 22 Sep 2024 15:38:41 +0800 Subject: [PATCH] [INLONG-11172][SDK] Transform REGEXP_MATCHES() function supports more flags --- .../function/RegexpMatchesFunction.java | 44 +++++++++++-------- .../string/TestRegexpMatchesFunction.java | 13 ++++++ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java index d86641654dd..e12e8c4a3fc 100644 --- a/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java +++ b/inlong-sdk/transform-sdk/src/main/java/org/apache/inlong/sdk/transform/process/function/RegexpMatchesFunction.java @@ -41,7 +41,8 @@ * 3) flags: one or more characters that control the behavior of a function, * 'g' flag can be used when we want to match all the substrings that occur, * 'i' flag to ignore case for matching, - * 'm' flag allows regular expressions to match across multiple lines + * 'x' flag to extend syntax (ignoring whitespace and comments in regular expressions), + * 'm' and 'n' flags allow regular expressions to match across multiple lines */ @TransformFunction(names = {"regexp_matches"}) public class RegexpMatchesFunction implements ValueParser { @@ -78,30 +79,37 @@ public Object parse(SourceData sourceData, int rowIndex, Context context) { private String regexpMatches(String input, String regex, String flags) { int flag = 0; - if (flags.contains("i")) { - flag |= Pattern.CASE_INSENSITIVE; - } - if (flags.contains("m")) { - flag |= Pattern.MULTILINE; - } - if (flags.contains("g")) { - flag |= Pattern.DOTALL; + if (flags != null) { + if (flags.contains("i")) { + flag |= Pattern.CASE_INSENSITIVE; + } + if (flags.contains("m") || flags.contains("n")) { + flag |= 
Pattern.MULTILINE; + } + if (flags.contains("s")) { + flag |= Pattern.DOTALL; + } + if (flags.contains("x")) { + flag |= Pattern.COMMENTS; + } } Pattern pattern = Pattern.compile(regex, flag); Matcher matcher = pattern.matcher(input); - + boolean isGlobalMatch = flags != null && flags.contains("g"); List matches = new ArrayList<>(); while (matcher.find()) { - if (matcher.groupCount() == 0) { - matches.add(new String[]{matcher.group(0)}); - } else { - String[] matchGroups = new String[matcher.groupCount()]; - for (int i = 1; i <= matcher.groupCount(); i++) { - matchGroups[i - 1] = matcher.group(i) != null ? matcher.group(i) : ""; - } - matches.add(matchGroups); + int groupCount = matcher.groupCount(); + String[] matchGroups = new String[groupCount > 0 ? groupCount : 1]; + + for (int i = 0; i <= groupCount; i++) { + matchGroups[i == 0 ? 0 : i - 1] = matcher.group(i) != null ? matcher.group(i) : ""; + } + matches.add(matchGroups); + + if (!isGlobalMatch) { + break; } } return listToString(matches); diff --git a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java index f3569ecaff2..48c2a6f835d 100644 --- a/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java +++ b/inlong-sdk/transform-sdk/src/test/java/org/apache/inlong/sdk/transform/process/function/string/TestRegexpMatchesFunction.java @@ -37,6 +37,7 @@ public void testRegexpMatchesFunction() throws Exception { TransformProcessor processor1 = TransformProcessor .create(config1, SourceDecoderFactory.createCsvDecoder(csvSource), SinkEncoderFactory.createKvEncoder(kvSink)); + // case1: regexp_matches("The quick brown fox", "quick") List output1 = processor1.transform("The quick brown fox|quick|5|2|1|3", new HashMap<>()); Assert.assertEquals(1, 
output1.size()); @@ -46,11 +47,13 @@ public void testRegexpMatchesFunction() throws Exception { TransformProcessor processor2 = TransformProcessor .create(config2, SourceDecoderFactory.createCsvDecoder(csvSource), SinkEncoderFactory.createKvEncoder(kvSink)); + // case2: regexp_matches("User: Alice, ID: 12345", "User: (\\w+), ID: (\\d+)") List output2 = processor2.transform("User: Alice, ID: 12345|User: (\\\\w+), ID: (\\\\d+)|5|2|1|3", new HashMap<>()); Assert.assertEquals(1, output2.size()); Assert.assertEquals(output2.get(0), "result=[{\"Alice\",\"12345\"}]"); + // case3: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345; // User: Bob, ID: 67890", "User: (\\w+), ID: (\\d+)") List output3 = @@ -62,10 +65,12 @@ public void testRegexpMatchesFunction() throws Exception { TransformProcessor processor3 = TransformProcessor .create(config3, SourceDecoderFactory.createCsvDecoder(csvSource), SinkEncoderFactory.createKvEncoder(kvSink)); + // case4: regexp_matches("foo 123 bar 456", "\\d+", "g") List output4 = processor3.transform("foo 123 bar 456|\\\\d+|g|2|1|3", new HashMap<>()); Assert.assertEquals(1, output4.size()); Assert.assertEquals(output4.get(0), "result=[{\"123\"},{\"456\"}]"); + // case5: regexp_matches("User: Alice, ID: 12345User: Alice, ID: 12345; // User: Bob, ID: 67890", "User: (\\w+),ID: (\\d+)", "g") List output5 = processor3.transform( @@ -77,6 +82,7 @@ public void testRegexpMatchesFunction() throws Exception { TransformProcessor processor4 = TransformProcessor .create(config4, SourceDecoderFactory.createCsvDecoder(csvSource), SinkEncoderFactory.createKvEncoder(kvSink)); + // case6: regexp_matches("Hello! hello World", "hello", "ig") List output6 = processor4.transform("Hello! 
hello World|hello|ig|2|1|3", new HashMap<>()); Assert.assertEquals(1, output6.size()); @@ -86,9 +92,16 @@ public void testRegexpMatchesFunction() throws Exception { TransformProcessor processor5 = TransformProcessor .create(config5, SourceDecoderFactory.createCsvDecoder(csvSource), SinkEncoderFactory.createKvEncoder(kvSink)); + // case7: regexp_matches("First line\nSecond line", "^Second", "m") List output7 = processor5.transform("First line\\\nSecond line|^Second|m|2|1|3", new HashMap<>()); Assert.assertEquals(1, output7.size()); Assert.assertEquals(output7.get(0), "result=[{\"Second\"}]"); + + // without 'g' flag + // case8: regexp_matches("Hello! hello World", "hello", "i") + List output8 = processor5.transform("Hello! hello World|hello|i|2|1|3", new HashMap<>()); + Assert.assertEquals(1, output8.size()); + Assert.assertEquals(output8.get(0), "result=[{\"Hello\"}]"); } }