Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support casting StringArray/BinaryArray --> StringView / BinaryView #5686

Merged
merged 4 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,18 @@ impl BinaryViewArray {
}
}

impl From<Vec<&[u8]>> for BinaryViewArray {
fn from(v: Vec<&[u8]>) -> Self {
Self::from_iter_values(v)
}
}

/// Converts a vector of optional byte slices into a [`BinaryViewArray`] via
/// the array's `FromIterator` implementation.
impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
    fn from(v: Vec<Option<&[u8]>>) -> Self {
        std::iter::FromIterator::from_iter(v)
    }
}

/// A [`GenericByteViewArray`] that stores utf8 data
///
/// # Example
Expand Down
129 changes: 127 additions & 2 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;

use arrow_buffer::ScalarBuffer;
use arrow_data::ByteView;
use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
use std::sync::Arc;
Expand Down Expand Up @@ -119,6 +121,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Utf8
| LargeBinary
| LargeUtf8
| BinaryView
| Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
Expand Down Expand Up @@ -192,8 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8
}

(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
(FixedSizeBinary(_), Binary | LargeBinary) => true,
(
Utf8 | LargeUtf8,
Expand All @@ -213,6 +217,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Timestamp(Nanosecond, _)
| Interval(_),
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),

Expand Down Expand Up @@ -611,6 +616,8 @@ pub fn cast_with_options(
| Utf8
| LargeBinary
| LargeUtf8
| BinaryView
| Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
Expand Down Expand Up @@ -1120,6 +1127,7 @@ pub fn cast_with_options(
let binary = BinaryArray::from(array.as_string::<i32>().clone());
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
Expand Down Expand Up @@ -1179,6 +1187,7 @@ pub fn cast_with_options(
LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
array.as_string::<i64>().clone(),
))),
Utf8View => cast_byte_to_view::<LargeUtf8Type, StringViewType>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
Expand Down Expand Up @@ -1226,6 +1235,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i32>(array, *size, cast_options)
}
BinaryView => cast_byte_to_view::<BinaryType, BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
Expand All @@ -1240,6 +1250,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i64>(array, *size, cast_options)
}
BinaryView => cast_byte_to_view::<LargeBinaryType, BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
Expand Down Expand Up @@ -2238,6 +2249,56 @@ where
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
}

/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` array.
///
/// One 16-byte view is built per window of the source offsets: values of at
/// most 12 bytes are copied into the view itself, while longer values are
/// described by their length, their first 4 bytes (prefix) and their start
/// offset into data buffer 0. The source values buffer is passed to the
/// result as its only data buffer and the source null buffer is carried over.
///
/// # Panics
///
/// Panics if `array`'s data type is not `FROM::DATA_TYPE`, or if any offset
/// cannot be converted to `u32` (the `to_u32().unwrap()` calls below).
fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef, ArrowError>
where
FROM: ByteArrayType,
FROM::Offset: OffsetSizeTrait + ToPrimitive,
V: ByteViewType,
{
let data = array.to_data();
assert_eq!(data.data_type(), &FROM::DATA_TYPE);

let len = array.len();
// Buffer 1 holds the value bytes, buffer 0 the offsets into it.
// NOTE(review): assumes the buffers returned by `to_data()` already reflect
// any slicing of `array` — TODO confirm.
let str_values_buf = data.buffers()[1].clone();
let offsets = data.buffers()[0].typed_data::<FROM::Offset>();

let mut views_builder = BufferBuilder::<u128>::new(len);
// Each adjacent pair of offsets delimits one value; null slots are not
// skipped here, they get a view computed from their offsets like any other.
for w in offsets.windows(2) {
// NOTE(review): these unwraps panic when an offset does not fit in `u32`.
// This function is also instantiated with the Large* (i64 offset) types, so
// presumably a values buffer past `u32::MAX` bytes reaches this path —
// consider returning an `ArrowError` instead; confirm with maintainers.
let offset = w[0].to_u32().unwrap();
let end = w[1].to_u32().unwrap();
let value_buf = &str_values_buf[offset as usize..end as usize];
let length = end - offset;

// A view is 16 bytes: 4 bytes of length leave 12 bytes for inline data.
if length <= 12 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As an aside (not for this PR) it would be great to somehow encapsulate this logic into a struct to avoid having to copy the same pattern it so many times. I took a shot at this here #5619

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new enum View looks good to me.

// Inline form: little-endian length in bytes 0..4, then the value bytes;
// the remaining bytes stay zero.
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
views_builder.append(u128::from_le_bytes(view_buffer));
} else {
// Reference form: `length > 12` here, so the 4-byte prefix slice is always
// in bounds. `buffer_index` is 0 because the result has a single data buffer.
let view = ByteView {
length,
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
buffer_index: 0,
offset,
};
views_builder.append(view.into());
}
}

// Exactly one view must have been produced per array element.
assert_eq!(views_builder.len(), len);

// Safety: the input was a valid array, so it is valid UTF8 (if string), all
// offsets were valid, and we created the views correctly
Ok(Arc::new(unsafe {
alamb marked this conversation as resolved.
Show resolved Hide resolved
GenericByteViewArray::<V>::new_unchecked(
ScalarBuffer::new(views_builder.finish(), 0, len),
vec![str_values_buf],
data.nulls().cloned(),
)
}))
}

#[cfg(test)]
mod tests {
use arrow_buffer::{Buffer, NullBuffer};
Expand Down Expand Up @@ -5044,6 +5105,70 @@ mod tests {
}
}

// Runs the string -> `Utf8View` cast check once for each offset width
// (`i32` and `i64`) supported by `GenericStringArray`.
#[test]
fn test_string_to_view() {
_test_string_to_view::<i32>();
_test_string_to_view::<i64>();
}

/// Builds a `GenericStringArray<O>` containing short values, a null and one
/// value longer than 12 bytes, then checks that casting it to `Utf8View` is
/// reported as supported and yields the same array as building a
/// `StringViewArray` directly from the input.
fn _test_string_to_view<O>()
where
    O: OffsetSizeTrait,
{
    let values = vec![
        Some("hello"),
        Some("world"),
        None,
        Some("large payload over 12 bytes"),
        Some("lulu"),
    ];
    let target_type = DataType::Utf8View;

    let source = GenericStringArray::<O>::from(values.clone());
    assert!(can_cast_types(source.data_type(), &target_type));

    let casted = cast(&source, &target_type).unwrap();
    assert_eq!(casted.data_type(), &target_type);

    let expected = StringViewArray::from(values);
    assert_eq!(casted.as_ref(), &expected);
}

/// Runs the binary -> `BinaryView` cast check once for each offset width
/// (`i32` and `i64`) supported by `GenericBinaryArray`.
///
/// The name was previously misspelled ("bianry"), which kept the test from
/// being selected by a `binary_to_view` test-name filter.
#[test]
fn test_binary_to_view() {
    _test_binary_to_view::<i32>();
    _test_binary_to_view::<i64>();
}

/// Builds a `GenericBinaryArray<O>` containing short values, a null and one
/// value longer than 12 bytes, then checks that casting it to `BinaryView` is
/// reported as supported and yields the same array as building a
/// `BinaryViewArray` directly from the input.
fn _test_binary_to_view<O>()
where
    O: OffsetSizeTrait,
{
    let values: Vec<Option<&[u8]>> = vec![
        Some(b"hello"),
        Some(b"world"),
        None,
        Some(b"large payload over 12 bytes"),
        Some(b"lulu"),
    ];
    let target_type = DataType::BinaryView;

    let source = GenericBinaryArray::<O>::from(values.clone());
    assert!(can_cast_types(source.data_type(), &target_type));

    let casted = cast(&source, &target_type).unwrap();
    assert_eq!(casted.data_type(), &target_type);

    let expected = BinaryViewArray::from(values);
    assert_eq!(casted.as_ref(), &expected);
}

#[test]
fn test_cast_from_f64() {
let f64_values: Vec<f64> = vec![
Expand Down
Loading