Skip to content

Commit

Permalink
[INLONG-10119][SDK] Supporting Data Sharding with GroupBy Semantics
Browse files Browse the repository at this point in the history
  • Loading branch information
emptyOVO committed Sep 20, 2024
1 parent ed42b8e commit f8c587b
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import com.google.common.collect.ImmutableMap;
import net.sf.jsqlparser.JSQLParserException;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.parser.CCJSqlParserManager;
import net.sf.jsqlparser.statement.select.AllColumns;
import net.sf.jsqlparser.statement.select.PlainSelect;
Expand All @@ -41,6 +42,7 @@

import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Expand All @@ -62,6 +64,7 @@ public class TransformProcessor<I, O> {
private PlainSelect transformSelect;
private ExpressionOperator where;
private List<ValueParserNode> selectItems;
private List<ValueParserNode> groupByItems;

private List<String> sinkFieldList;

Expand Down Expand Up @@ -131,6 +134,14 @@ private void initTransformSql(String sql) throws JSQLParserException {
this.encoder.getFields().clear();
this.selectItems.add(new ValueParserNode(fieldName, null));
}
if (this.transformSelect.getGroupBy() != null) {
this.groupByItems = new ArrayList<>();
List<Expression> groupByExpressions = this.transformSelect.getGroupBy().getGroupByExpressions();
for (Expression expr : groupByExpressions) {
ValueParser parser = OperatorTools.buildParser(expr);
groupByItems.add(new ValueParserNode(expr.toString(), parser));
}
}
}
}

Expand All @@ -153,7 +164,8 @@ public List<O> transform(I input, Map<String, Object> extParams) {
if (sourceData == null) {
return null;
}

boolean hasGroupBy = this.groupByItems != null && !this.groupByItems.isEmpty();
Map<String, DefaultSinkData> groupedData = hasGroupBy ? new HashMap<>() : null;
List<O> sinkDatas = new ArrayList<>(sourceData.getRowCount());
for (int i = 0; i < sourceData.getRowCount(); i++) {

Expand All @@ -162,38 +174,64 @@ public List<O> transform(I input, Map<String, Object> extParams) {
continue;
}

DefaultSinkData sinkData;
if (hasGroupBy) {
// has 'group by' key
String groupKey = generateGroupKey(sourceData, i, context);
groupedData.putIfAbsent(groupKey, new DefaultSinkData());
sinkData = groupedData.get(groupKey);
} else {
sinkData = new DefaultSinkData();
}
// parse value
DefaultSinkData sinkData = new DefaultSinkData();
for (ValueParserNode node : this.selectItems) {
String fieldName = node.getFieldName();
ValueParser parser = node.getParser();
if (parser == null || StringUtils.equals(fieldName, SinkEncoder.ALL_SOURCE_FIELD_SIGN)) {
if (input instanceof String) {
sinkData.addField(fieldName, (String) input);
} else {
sinkData.addField(fieldName, "");
}
continue;
parseSelectItems(input, sourceData, i, context, sinkData);
if (!hasGroupBy) {
if (this.sinkFieldList != null) {
sinkData.setKeyList(this.sinkFieldList);
}
sinkDatas.add(this.encoder.encode(sinkData, context));
}
}
if (hasGroupBy) {
for (DefaultSinkData groupData : groupedData.values()) {
if (this.sinkFieldList != null) {
groupData.setKeyList(this.sinkFieldList);
}
sinkDatas.add(this.encoder.encode(groupData, context));
}
}
return sinkDatas;
}

private void parseSelectItems(I input, SourceData sourceData, int rowIndex, Context context,
DefaultSinkData sinkData) {
for (ValueParserNode node : this.selectItems) {
String fieldName = node.getFieldName();
ValueParser parser = node.getParser();

if (parser == null || StringUtils.equals(fieldName, SinkEncoder.ALL_SOURCE_FIELD_SIGN)) {
if (input instanceof String) {
sinkData.addField(fieldName, (String) input);
} else {
sinkData.addField(fieldName, "");
}
} else {
try {
Object fieldValue = parser.parse(sourceData, i, context);
if (fieldValue == null) {
sinkData.addField(fieldName, "");
} else {
sinkData.addField(fieldName, fieldValue.toString());
}
Object fieldValue = parser.parse(sourceData, rowIndex, context);
sinkData.addField(fieldName, fieldValue != null ? fieldValue.toString() : "");
} catch (Throwable t) {
sinkData.addField(fieldName, "");
}
}

if (this.sinkFieldList != null) {
sinkData.setKeyList(this.sinkFieldList);
}
// encode
sinkDatas.add(this.encoder.encode(sinkData, context));
}
return sinkDatas;
}

/**
 * Builds the grouping key for one source row by evaluating every group-by
 * expression against that row and concatenating the results in declaration order.
 *
 * <p>A {@code null} value contributes the literal text {@code "null"} because
 * {@link StringBuilder#append(Object)} formats it that way.
 *
 * <p>NOTE(review): the values are joined without any separator, so different
 * value tuples can yield the same key (e.g. {@code "ab" + "c"} and
 * {@code "a" + "bc"}). Adding a separator changes the key text and therefore
 * the iteration order of a hash-based map keyed by it — confirm no caller or
 * test depends on that order before changing the format.
 *
 * @param sourceData decoded source rows
 * @param rowIndex index of the row to evaluate
 * @param context transform context handed to each parser
 * @return the concatenated string form of all group-by values for the row
 */
private String generateGroupKey(SourceData sourceData, int rowIndex, Context context) {
    final StringBuilder key = new StringBuilder();
    final int itemCount = this.groupByItems.size();
    for (int idx = 0; idx < itemCount; idx++) {
        final ValueParser valueParser = this.groupByItems.get(idx).getParser();
        final Object component = valueParser.parse(sourceData, rowIndex, context);
        key.append(component);
    }
    return key.toString();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.processor;

import org.apache.inlong.sdk.transform.decode.SourceDecoderFactory;
import org.apache.inlong.sdk.transform.encode.SinkEncoderFactory;
import org.apache.inlong.sdk.transform.pojo.CsvSinkInfo;
import org.apache.inlong.sdk.transform.pojo.FieldInfo;
import org.apache.inlong.sdk.transform.pojo.JsonSourceInfo;
import org.apache.inlong.sdk.transform.pojo.TransformConfig;
import org.apache.inlong.sdk.transform.process.TransformProcessor;

import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;
import java.util.List;

/**
 * Tests {@code group by} handling of {@link TransformProcessor} with a JSON source
 * (child array {@code msgs}) and a CSV sink using {@code |} as the delimiter.
 *
 * <p>NOTE(review): both tests assert the exact position of every output row.
 * Confirm that the processor guarantees a stable group order; otherwise these
 * assertions depend on the iteration order of whatever map holds the groups.
 */
public class TestGroupByProcessor extends AbstractProcessorTestBase {

    /**
     * Groups by two child fields ({@code msgTime} and {@code msg}).
     *
     * <p>The input holds eight child rows, of which exactly one
     * ({@code value3 / 1713243918000}) occurs twice, so seven rows are expected.
     */
    @Test
    public void testJsonGroupBy() throws Exception {
        List<FieldInfo> fields1 = this.getTestFieldList("sid", "packageID", "msgTime", "msg");
        JsonSourceInfo jsonSource1 = new JsonSourceInfo("UTF-8", "msgs");
        CsvSinkInfo csvSink1 = new CsvSinkInfo("UTF-8", '|', '\\', fields1);
        String transformSql1 = "select $root.sid,$root.packageID,$child.msgTime,$child.msg from source " +
                "group by $child.msgTime,$child.msg";
        TransformConfig config1 = new TransformConfig(transformSql1);
        // case1
        TransformProcessor<String, String> processor1 = TransformProcessor
                .create(config1, SourceDecoderFactory.createJsonDecoder(jsonSource1),
                        SinkEncoderFactory.createCsvEncoder(csvSink1));
        String srcString = "{\n"
                + " \"sid\":\"value1\",\n"
                + " \"packageID\":\"value2\",\n"
                + " \"msgs\":[\n"
                + " {\"msg\":\"value3\",\"msgTime\":1713243918000},\n"
                + " {\"msg\":\"value4\",\"msgTime\":1713243918001},\n"
                + " {\"msg\":\"value5\",\"msgTime\":1713243918002},\n"
                + " {\"msg\":\"value6\",\"msgTime\":1713243918003},\n"
                + " {\"msg\":\"value3\",\"msgTime\":1713243918000},\n"
                + " {\"msg\":\"value8\",\"msgTime\":1713243918001},\n"
                + " {\"msg\":\"value9\",\"msgTime\":1713243918002},\n"
                + " {\"msg\":\"value10\",\"msgTime\":1713243918003}\n"
                + " ]\n"
                + "}";
        List<String> output1 = processor1.transform(srcString, new HashMap<>());
        // JUnit's contract is assertEquals(expected, actual); keeping that order
        // makes failure messages report the right side as "expected".
        Assert.assertEquals(7, output1.size());
        Assert.assertEquals("value1|value2|1713243918000|value3", output1.get(0));
        Assert.assertEquals("value1|value2|1713243918001|value8", output1.get(1));
        Assert.assertEquals("value1|value2|1713243918002|value5", output1.get(2));
        Assert.assertEquals("value1|value2|1713243918003|value10", output1.get(3));
        Assert.assertEquals("value1|value2|1713243918002|value9", output1.get(4));
        Assert.assertEquals("value1|value2|1713243918001|value4", output1.get(5));
        Assert.assertEquals("value1|value2|1713243918003|value6", output1.get(6));
    }

    /**
     * Groups by a single child field ({@code msgTime}).
     *
     * <p>NOTE(review): the seven input rows carry only four distinct
     * {@code msgTime} values (…000, …001, …004, …005), yet five output rows are
     * expected and index 3 is never asserted. Confirm the intended row count and
     * add the missing assertion once the expected value is known.
     */
    @Test
    public void testJsonGroupByForOneField() throws Exception {
        List<FieldInfo> fields1 = this.getTestFieldList("sid", "packageID", "msgTime", "msg");
        JsonSourceInfo jsonSource1 = new JsonSourceInfo("UTF-8", "msgs");
        CsvSinkInfo csvSink1 = new CsvSinkInfo("UTF-8", '|', '\\', fields1);
        String transformSql1 =
                "select $root.sid,$root.packageID,$child.msgTime,$child.msg from source group by $child.msgTime";
        TransformConfig config1 = new TransformConfig(transformSql1);
        // case1
        TransformProcessor<String, String> processor1 = TransformProcessor
                .create(config1, SourceDecoderFactory.createJsonDecoder(jsonSource1),
                        SinkEncoderFactory.createCsvEncoder(csvSink1));
        String srcString = "{\n"
                + " \"sid\":\"value1\",\n"
                + " \"packageID\":\"value2\",\n"
                + " \"msgs\":[\n"
                + " {\"msg\":\"value3\",\"msgTime\":1713243918000},\n"
                + " {\"msg\":\"value4\",\"msgTime\":1713243918000},\n"
                + " {\"msg\":\"value5\",\"msgTime\":1713243918001},\n"
                + " {\"msg\":\"value6\",\"msgTime\":1713243918001},\n"
                + " {\"msg\":\"value7\",\"msgTime\":1713243918000},\n"
                + " {\"msg\":\"value8\",\"msgTime\":1713243918004},\n"
                + " {\"msg\":\"value10\",\"msgTime\":1713243918005}\n"
                + " ]\n"
                + "}";
        List<String> output1 = processor1.transform(srcString, new HashMap<>());
        // Arguments follow JUnit's (expected, actual) order.
        Assert.assertEquals(5, output1.size());
        Assert.assertEquals("value1|value2|1713243918000|value7", output1.get(0));
        Assert.assertEquals("value1|value2|1713243918001|value6", output1.get(1));
        Assert.assertEquals("value1|value2|1713243918004|value8", output1.get(2));
        Assert.assertEquals("value1|value2|1713243918005|value10", output1.get(4));
    }
}

0 comments on commit f8c587b

Please sign in to comment.