Skip to content

Commit

Permalink
[INLONG-11037][SDK] Transform support ENCODE() and DECODE() function (a…
Browse files Browse the repository at this point in the history
…pache#11041)

* [INLONG-11037][SDK] Transform support ENCODE() and DECODE() function

* fix: add NP check

* fix: clear definition of the specific encoding type

* fix: add description
  • Loading branch information
emptyOVO authored Sep 11, 2024
1 parent e0a827c commit 751f69d
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * DecodeFunction
 * description: decode(binary, string)
 * Decodes a whitespace-separated list of decimal byte values (for example "72 101 108 108 111")
 * into a string using the supplied character set
 * ('US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'; matched case-insensitively).
 * - Returns NULL if either argument evaluates to NULL, or if the function was not given exactly two arguments.
 * - Returns an empty string if either argument is empty, if the character set is not one of the supported
 *   ones, or if the byte list contains a token that is not a decimal value in the range [-128, 255].
 */
@TransformFunction(names = {"decode"})
public class DecodeFunction implements ValueParser {

    /** Parser for the first argument (the byte list); null when the argument count is not 2. */
    private final ValueParser binaryParser;

    /** Parser for the second argument (the character set name); null when the argument count is not 2. */
    private final ValueParser characterSetParser;

    /** Canonical (upper-case) names of the character sets this function accepts. */
    private static final Set<String> SUPPORTED_CHARSETS;

    static {
        Set<String> charsets = new HashSet<>();
        charsets.add(StandardCharsets.US_ASCII.name());
        charsets.add(StandardCharsets.ISO_8859_1.name());
        charsets.add(StandardCharsets.UTF_8.name());
        charsets.add(StandardCharsets.UTF_16.name());
        charsets.add(StandardCharsets.UTF_16BE.name());
        charsets.add(StandardCharsets.UTF_16LE.name());
        SUPPORTED_CHARSETS = Collections.unmodifiableSet(charsets);
    }

    /**
     * Builds the argument parsers from the SQL function expression.
     *
     * @param expr the parsed {@code decode(...)} call; its parameter list may be absent
     */
    public DecodeFunction(Function expr) {
        // getParameters() may be null (no argument list), so guard before dereferencing it.
        List<Expression> expressions =
                expr.getParameters() == null ? null : expr.getParameters().getExpressions();
        ValueParser first = null;
        ValueParser second = null;
        if (expressions != null && expressions.size() == 2) {
            first = OperatorTools.buildParser(expressions.get(0));
            second = OperatorTools.buildParser(expressions.get(1));
        }
        this.binaryParser = first;
        this.characterSetParser = second;
    }

    /**
     * Evaluates both arguments for the given row and decodes the byte list.
     *
     * @return the decoded string, an empty string for empty/unsupported/malformed input,
     *         or {@code null} when an argument is NULL or the argument count was not 2
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        // Wrong argument count: yield NULL instead of failing with a NullPointerException.
        if (binaryParser == null || characterSetParser == null) {
            return null;
        }
        Object binaryObj = binaryParser.parse(sourceData, rowIndex, context);
        Object characterObj = characterSetParser.parse(sourceData, rowIndex, context);
        if (binaryObj == null || characterObj == null) {
            return null;
        }
        String binaryString = OperatorTools.parseString(binaryObj);
        String characterSetValue = OperatorTools.parseString(characterObj);
        // Locale.ROOT keeps the upper-casing independent of the JVM default locale
        // (e.g. a Turkish locale would turn "iso-8859-1" into "İSO-8859-1").
        String charsetName = characterSetValue == null
                ? null
                : characterSetValue.trim().toUpperCase(java.util.Locale.ROOT);
        return decode(binaryString, charsetName);
    }

    /**
     * Converts the textual byte list into a string.
     *
     * @param binaryString whitespace-separated decimal byte values
     * @param charsetName  upper-cased character set name
     * @return the decoded text, or an empty string when the input is empty, the character set is
     *         not supported, or a token is not a valid byte value
     */
    private String decode(String binaryString, String charsetName) {
        if (binaryString == null || charsetName == null || charsetName.isEmpty()) {
            return "";
        }
        String trimmed = binaryString.trim();
        if (trimmed.isEmpty()) {
            return "";
        }
        // Check the whitelist first: Charset.isSupported/forName throw IllegalCharsetNameException
        // for syntactically illegal names, and every whitelisted charset is a StandardCharsets
        // constant, so no further availability check is needed.
        if (!SUPPORTED_CHARSETS.contains(charsetName)) {
            return "";
        }
        // Split on runs of whitespace so that doubled separators do not produce empty tokens.
        String[] byteValues = trimmed.split("\\s+");
        byte[] byteArray = new byte[byteValues.length];
        for (int i = 0; i < byteValues.length; i++) {
            int value;
            try {
                value = Integer.parseInt(byteValues[i]);
            } catch (NumberFormatException ignored) {
                // A non-numeric token makes the whole byte list invalid; treat it like
                // any other undecodable input.
                return "";
            }
            // Accept both the signed (-128..127) and the unsigned (0..255) spelling of a byte,
            // but refuse values that would be silently truncated by the cast below.
            if (value < Byte.MIN_VALUE || value > 0xFF) {
                return "";
            }
            byteArray[i] = (byte) value;
        }
        return new String(byteArray, Charset.forName(charsetName));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function;

import org.apache.inlong.sdk.transform.decode.SourceData;
import org.apache.inlong.sdk.transform.process.Context;
import org.apache.inlong.sdk.transform.process.operator.OperatorTools;
import org.apache.inlong.sdk.transform.process.parser.ValueParser;

import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * EncodeFunction
 * description: encode(string1, string2)
 * Encodes string1 using the provided character set
 * ('US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'; matched case-insensitively)
 * and returns the resulting bytes as signed decimal values separated by single spaces
 * (for example "72 101 108 108 111").
 * - Returns NULL if either argument evaluates to NULL, or if the function was not given exactly two arguments.
 * - Returns an empty string if either argument is empty or the character set is not one of the supported ones.
 */
@TransformFunction(names = {"encode"})
public class EncodeFunction implements ValueParser {

    /** Parser for the first argument (the text to encode); null when the argument count is not 2. */
    private final ValueParser stringParser;

    /** Parser for the second argument (the character set name); null when the argument count is not 2. */
    private final ValueParser characterSetParser;

    /** Canonical (upper-case) names of the character sets this function accepts. */
    private static final Set<String> SUPPORTED_CHARSETS;

    static {
        Set<String> charsets = new HashSet<>();
        charsets.add(StandardCharsets.US_ASCII.name());
        charsets.add(StandardCharsets.ISO_8859_1.name());
        charsets.add(StandardCharsets.UTF_8.name());
        charsets.add(StandardCharsets.UTF_16.name());
        charsets.add(StandardCharsets.UTF_16BE.name());
        charsets.add(StandardCharsets.UTF_16LE.name());
        SUPPORTED_CHARSETS = Collections.unmodifiableSet(charsets);
    }

    /**
     * Builds the argument parsers from the SQL function expression.
     *
     * @param expr the parsed {@code encode(...)} call; its parameter list may be absent
     */
    public EncodeFunction(Function expr) {
        // getParameters() may be null (no argument list), so guard before dereferencing it.
        List<Expression> expressions =
                expr.getParameters() == null ? null : expr.getParameters().getExpressions();
        ValueParser first = null;
        ValueParser second = null;
        if (expressions != null && expressions.size() == 2) {
            first = OperatorTools.buildParser(expressions.get(0));
            second = OperatorTools.buildParser(expressions.get(1));
        }
        this.stringParser = first;
        this.characterSetParser = second;
    }

    /**
     * Evaluates both arguments for the given row and renders the encoded bytes as text.
     *
     * @return the space-separated signed byte values, an empty string for empty/unsupported input,
     *         or {@code null} when an argument is NULL or the argument count was not 2
     */
    @Override
    public Object parse(SourceData sourceData, int rowIndex, Context context) {
        // Wrong argument count: yield NULL instead of failing with a NullPointerException.
        if (stringParser == null || characterSetParser == null) {
            return null;
        }
        Object stringObj = stringParser.parse(sourceData, rowIndex, context);
        Object characterObj = characterSetParser.parse(sourceData, rowIndex, context);
        if (stringObj == null || characterObj == null) {
            return null;
        }
        String stringValue = OperatorTools.parseString(stringObj);
        String characterSetValue = OperatorTools.parseString(characterObj);
        // Locale.ROOT keeps the upper-casing independent of the JVM default locale
        // (e.g. a Turkish locale would turn "iso-8859-1" into "İSO-8859-1").
        String charsetName = characterSetValue == null
                ? null
                : characterSetValue.trim().toUpperCase(java.util.Locale.ROOT);
        byte[] encodeBytes = encode(stringValue, charsetName);
        StringBuilder res = new StringBuilder();
        for (byte encodeByte : encodeBytes) {
            // Emit the separator only between values, so no trailing space has to be trimmed.
            if (res.length() > 0) {
                res.append(' ');
            }
            res.append((int) encodeByte);
        }
        return res.toString();
    }

    /**
     * Encodes the text with the named character set.
     *
     * @param stringValue       the text to encode
     * @param characterSetValue upper-cased character set name
     * @return the encoded bytes; an empty array (never {@code null}) when either input is empty
     *         or the character set is not supported
     */
    private byte[] encode(String stringValue, String characterSetValue) {
        if (stringValue == null || stringValue.isEmpty()
                || characterSetValue == null || characterSetValue.isEmpty()) {
            return new byte[0];
        }
        // Check the whitelist first: Charset.isSupported/forName throw IllegalCharsetNameException
        // for syntactically illegal names, and every whitelisted charset is a StandardCharsets
        // constant, so no further availability check is needed.
        if (!SUPPORTED_CHARSETS.contains(characterSetValue)) {
            return new byte[0];
        }
        return stringValue.getBytes(Charset.forName(characterSetValue));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function.string;

import org.apache.inlong.sdk.transform.decode.SourceDecoderFactory;
import org.apache.inlong.sdk.transform.encode.SinkEncoderFactory;
import org.apache.inlong.sdk.transform.pojo.TransformConfig;
import org.apache.inlong.sdk.transform.process.TransformProcessor;

import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;
import java.util.List;

/**
 * Tests for the {@code decode(binary, charset)} transform function.
 */
public class TestDecodeFunction extends AbstractFunctionStringTestBase {

    /**
     * Feeds '|'-separated rows through {@code select decode(string1,string2) from source} and checks
     * the single key-value output line for every supported character set plus one unsupported name.
     */
    @Test
    public void testDecodeFunction() throws Exception {
        String transformSql = "select decode(string1,string2) from source";
        TransformConfig config = new TransformConfig(transformSql);
        TransformProcessor<String, String> processor = TransformProcessor
                .create(config, SourceDecoderFactory.createCsvDecoder(csvSource),
                        SinkEncoderFactory.createKvEncoder(kvSink));

        // case1: decode('72 101 108 108 111','UTF-8')
        assertSingleResult(processor, "72 101 108 108 111|UTF-8|banana|cloud|1", "result=Hello");

        // case2: decode('72 101 108 108 111','US-ASCII')
        assertSingleResult(processor, "72 101 108 108 111|US-ASCII|banana|cloud|1", "result=Hello");

        // case3: decode('72 101 108 108 111','ISO-8859-1')
        assertSingleResult(processor, "72 101 108 108 111|ISO-8859-1|banana|cloud|1", "result=Hello");

        // case4: decode('0 72 0 101 0 108 0 108 0 111','UTF-16BE')
        assertSingleResult(processor, "0 72 0 101 0 108 0 108 0 111|UTF-16BE|banana|cloud|1", "result=Hello");

        // case5: decode('72 0 101 0 108 0 108 0 111 0','UTf-16LE') - mixed-case charset name
        assertSingleResult(processor, "72 0 101 0 108 0 108 0 111 0|UTf-16LE|banana|cloud|1", "result=Hello");

        // case6: decode('-2 -1 0 72 0 101 0 108 0 108 0 111','UtF-16') - mixed-case charset name;
        // the leading -2 -1 are the signed bytes 0xFE 0xFF (big-endian byte order mark)
        assertSingleResult(processor, "-2 -1 0 72 0 101 0 108 0 108 0 111|UtF-16|banana|cloud|1",
                "result=Hello");

        // case7: decode('-2 -1 0 72 0 101 0 108 0 108 0 111','UTF-16--') - unsupported charset, empty result
        assertSingleResult(processor, "-2 -1 0 72 0 101 0 108 0 108 0 111|UTF-16--|banana|cloud|1",
                "result=");
    }

    /**
     * Transforms one source row and asserts that exactly one output line equal to {@code expected}
     * is produced. The expected value is passed first to {@code assertEquals} so that failure
     * messages label expected and actual correctly.
     */
    private static void assertSingleResult(TransformProcessor<String, String> processor, String row,
            String expected) throws Exception {
        List<String> output = processor.transform(row, new HashMap<>());
        Assert.assertEquals(1, output.size());
        Assert.assertEquals(expected, output.get(0));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.process.function.string;

import org.apache.inlong.sdk.transform.decode.SourceDecoderFactory;
import org.apache.inlong.sdk.transform.encode.SinkEncoderFactory;
import org.apache.inlong.sdk.transform.pojo.TransformConfig;
import org.apache.inlong.sdk.transform.process.TransformProcessor;

import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;
import java.util.List;

public class TestEncodeFunction extends AbstractFunctionStringTestBase {

@Test
public void testEncodeFunction() throws Exception {
String transformSql = "select encode(string1,string2) from source";
TransformConfig config = new TransformConfig(transformSql);
TransformProcessor<String, String> processor = TransformProcessor
.create(config, SourceDecoderFactory.createCsvDecoder(csvSource),
SinkEncoderFactory.createKvEncoder(kvSink));

// case1: encode('Hello','UTF-8')
List<String> output1 = processor.transform("Hello|UTF-8|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output1.size());
Assert.assertEquals(output1.get(0), "result=72 101 108 108 111");

// case2: encode('Hello','US-ASCII')
List<String> output2 = processor.transform("Hello|US-ASCII|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output2.size());
Assert.assertEquals(output2.get(0), "result=72 101 108 108 111");

// case3: encode('Hello','ISO-8859-1')
List<String> output3 = processor.transform("Hello|ISO-8859-1|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output3.size());
Assert.assertEquals(output3.get(0), "result=72 101 108 108 111");

// case4: encode('Hello','UTF-16BE')
List<String> output4 = processor.transform("Hello|UTF-16BE|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output4.size());
Assert.assertEquals(output4.get(0), "result=0 72 0 101 0 108 0 108 0 111");

// case5: encode('Hello','UTF-16LE')
List<String> output5 = processor.transform("Hello|UTf-16LE|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output5.size());
Assert.assertEquals(output5.get(0), "result=72 0 101 0 108 0 108 0 111 0");

// case6: encode('Hello','UTF-16')
List<String> output6 = processor.transform("Hello|UtF-16|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output6.size());
Assert.assertEquals(output6.get(0), "result=-2 -1 0 72 0 101 0 108 0 108 0 111");

// case7: encode('Hello','UTF-16--')
List<String> output7 = processor.transform("Hello|UTF-16--|banana|cloud|1", new HashMap<>());
Assert.assertEquals(1, output7.size());
Assert.assertEquals(output7.get(0), "result=");
}
}

0 comments on commit 751f69d

Please sign in to comment.