Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add stringview support to encode and decode and bit_length #13332

Merged
merged 5 commits into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions datafusion/functions/src/core/named_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::{Arc, OnceLock};

/// put values in a struct array.
/// Put values in a struct array.
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
// do not accept 0 arguments.
// Do not accept 0 arguments.
if args.is_empty() {
return exec_err!(
"named_struct requires at least one pair of arguments, got 0 instead"
Expand Down
20 changes: 14 additions & 6 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ impl ScalarUDFImpl for EncodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
Ok(vec![DataType::Utf8; 2])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Expand Down Expand Up @@ -195,7 +195,7 @@ impl ScalarUDFImpl for DecodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
Ok(vec![DataType::Binary, DataType::Utf8])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Expand Down Expand Up @@ -224,6 +224,7 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
ColumnarValue::Array(a) => match a.data_type() {
DataType::Utf8 => encoding.encode_utf8_array::<i32>(a.as_ref()),
DataType::LargeUtf8 => encoding.encode_utf8_array::<i64>(a.as_ref()),
DataType::Utf8View => encoding.encode_utf8_array::<i32>(a.as_ref()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this means the output type will be Utf8, but for this type I think that is totally fine (there is no need to preserve the view encoding).

DataType::Binary => encoding.encode_binary_array::<i32>(a.as_ref()),
DataType::LargeBinary => encoding.encode_binary_array::<i64>(a.as_ref()),
other => exec_err!(
Expand All @@ -237,6 +238,9 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
}
ScalarValue::LargeUtf8(a) => Ok(encoding
.encode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes()))),
ScalarValue::Utf8View(a) => {
Ok(encoding.encode_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
}
ScalarValue::Binary(a) => Ok(
encoding.encode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
),
Expand All @@ -255,6 +259,7 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
ColumnarValue::Array(a) => match a.data_type() {
DataType::Utf8 => encoding.decode_utf8_array::<i32>(a.as_ref()),
DataType::LargeUtf8 => encoding.decode_utf8_array::<i64>(a.as_ref()),
DataType::Utf8View => encoding.decode_utf8_array::<i32>(a.as_ref()),
DataType::Binary => encoding.decode_binary_array::<i32>(a.as_ref()),
DataType::LargeBinary => encoding.decode_binary_array::<i64>(a.as_ref()),
other => exec_err!(
Expand All @@ -268,6 +273,9 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
}
ScalarValue::LargeUtf8(a) => encoding
.decode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes())),
ScalarValue::Utf8View(a) => {
encoding.decode_scalar(a.as_ref().map(|s: &String| s.as_bytes()))
}
ScalarValue::Binary(a) => {
encoding.decode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
}
Expand Down Expand Up @@ -512,7 +520,7 @@ impl FromStr for Encoding {
}
}

/// Encodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
/// Encodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
/// Second argument is the encoding to use.
/// Standard encodings are base64 and hex.
fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
Expand All @@ -524,7 +532,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
}
let encoding = match &args[1] {
ColumnarValue::Scalar(scalar) => match scalar {
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
method.parse::<Encoding>()
}
_ => not_impl_err!(
Expand All @@ -538,7 +546,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
encode_process(&args[0], encoding)
}

/// Decodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
/// Decodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
/// Second argument is the encoding to use.
/// Standard encodings are base64 and hex.
fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
Expand All @@ -550,7 +558,7 @@ fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
}
let encoding = match &args[1] {
ColumnarValue::Scalar(scalar) => match scalar {
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
method.parse::<Encoding>()
}
_ => not_impl_err!(
Expand Down
3 changes: 3 additions & 0 deletions datafusion/functions/src/string/bit_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ impl ScalarUDFImpl for BitLengthFunc {
ScalarValue::LargeUtf8(v) => Ok(ColumnarValue::Scalar(
ScalarValue::Int64(v.as_ref().map(|x| (x.len() * 8) as i64)),
)),
ScalarValue::Utf8View(v) => Ok(ColumnarValue::Scalar(
ScalarValue::Int32(v.as_ref().map(|x| (x.len() * 8) as i32)),
)),
_ => unreachable!("bit length"),
},
}
Expand Down
31 changes: 31 additions & 0 deletions datafusion/sqllogictest/test_files/encoding.slt
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,34 @@ select to_hex(num) from test ORDER BY num;
0
1
2

# test for Utf8View support for encode
statement ok
CREATE TABLE test_source AS VALUES
('Andrew', 'X'),
('Xiangpeng', 'Xiangpeng'),
('Raphael', 'R'),
(NULL, 'R');

statement ok
CREATE TABLE test_utf8view AS
select
arrow_cast(column1, 'Utf8View') AS column1_utf8view,
arrow_cast(column2, 'Utf8View') AS column2_utf8view
FROM test_source;

query TTTTTT
SELECT
column1_utf8view,
encode(column1_utf8view, 'base64') AS column1_base64,
encode(column1_utf8view, 'hex') AS column1_hex,

column2_utf8view,
encode(column2_utf8view, 'base64') AS column2_base64,
encode(column2_utf8view, 'hex') AS column2_hex
FROM test_utf8view;
----
Andrew QW5kcmV3 416e64726577 X WA 58
Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
Raphael UmFwaGFlbA 5261706861656c R Ug 52
NULL NULL NULL R Ug 52
5 changes: 5 additions & 0 deletions datafusion/sqllogictest/test_files/expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,11 @@ SELECT bit_length(NULL)
----
NULL

query I
SELECT bit_length(arrow_cast('jonathan', 'Utf8View'));
----
64

query T
SELECT btrim(' xyxtrimyyx ', NULL)
----
Expand Down