From 7568178e37812424d9189c625f3958b165ec13cf Mon Sep 17 00:00:00 2001
From: RinChanNOW
Date: Mon, 6 May 2024 23:26:06 +0800
Subject: [PATCH] Support casting `StringView`/`BinaryView` --> `StringArray`/`BinaryArray`. (#5704)

---
 arrow-cast/src/cast/mod.rs | 110 +++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 36072760ed07..171267f80543 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -218,6 +218,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
                 | Interval(_),
         ) => true,
         (Utf8 | LargeUtf8, Utf8View) => true,
+        (Utf8View, Utf8 | LargeUtf8) => true,
+        (BinaryView, Binary | LargeBinary) => true,
         (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
         (_, Utf8 | LargeUtf8) => from_type.is_primitive(),
 
@@ -1262,6 +1264,12 @@ pub fn cast_with_options(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
         },
+        (Utf8View, Utf8) => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
+        (Utf8View, LargeUtf8) => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
+        (BinaryView, Binary) => cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array),
+        (BinaryView, LargeBinary) => {
+            cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
+        }
         (from_type, LargeUtf8) if from_type.is_primitive() => {
             value_to_string::<i64>(array, cast_options)
         }
@@ -2299,6 +2307,32 @@ where
     }))
 }
 
+/// Helper function to cast from one `ByteViewType` array to `ByteArrayType` array.
+fn cast_view_to_byte<FROM, TO>(array: &dyn Array) -> Result<ArrayRef, ArrowError>
+where
+    FROM: ByteViewType,
+    TO: ByteArrayType,
+    FROM::Native: AsRef<TO::Native>,
+{
+    let data = array.to_data();
+    let view_array = GenericByteViewArray::<FROM>::from(data);
+
+    let len = view_array.len();
+    let bytes = view_array
+        .views()
+        .iter()
+        .map(|v| ByteView::from(*v).length as usize)
+        .sum::<usize>();
+
+    let mut byte_array_builder = GenericByteBuilder::<TO>::with_capacity(len, bytes);
+
+    for val in view_array.iter() {
+        byte_array_builder.append_option(val);
+    }
+
+    Ok(Arc::new(byte_array_builder.finish()))
+}
+
 #[cfg(test)]
 mod tests {
     use arrow_buffer::{Buffer, NullBuffer};
@@ -5169,6 +5203,82 @@ mod tests {
         assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
     }
 
+    #[test]
+    fn test_view_to_string() {
+        _test_view_to_string::<i32>();
+        _test_view_to_string::<i64>();
+    }
+
+    fn _test_view_to_string<O>()
+    where
+        O: OffsetSizeTrait,
+    {
+        let data: Vec<Option<&str>> = vec![
+            Some("hello"),
+            Some("world"),
+            None,
+            Some("large payload over 12 bytes"),
+            Some("lulu"),
+        ];
+
+        let view_array = {
+            // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
+            let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
+            for s in data.iter() {
+                builder.append_option(*s);
+            }
+            builder.finish()
+        };
+
+        let expected_string_array = GenericStringArray::<O>::from(data);
+        let expected_type = expected_string_array.data_type();
+
+        assert!(can_cast_types(view_array.data_type(), expected_type));
+
+        let string_array = cast(&view_array, expected_type).unwrap();
+        assert_eq!(string_array.data_type(), expected_type);
+
+        assert_eq!(string_array.as_ref(), &expected_string_array);
+    }
+
+    #[test]
+    fn test_view_to_binary() {
+        _test_view_to_binary::<i32>();
+        _test_view_to_binary::<i64>();
+    }
+
+    fn _test_view_to_binary<O>()
+    where
+        O: OffsetSizeTrait,
+    {
+        let data: Vec<Option<&[u8]>> = vec![
+            Some(b"hello"),
+            Some(b"world"),
+            None,
+            Some(b"large payload over 12 bytes"),
+            Some(b"lulu"),
+        ];
+
+        let view_array = {
+            // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
+            let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
+            for s in data.iter() {
+                builder.append_option(*s);
+            }
+            builder.finish()
+        };
+
+        let expected_binary_array = GenericBinaryArray::<O>::from(data);
+        let expected_type = expected_binary_array.data_type();
+
+        assert!(can_cast_types(view_array.data_type(), expected_type));
+
+        let binary_array = cast(&view_array, expected_type).unwrap();
+        assert_eq!(binary_array.data_type(), expected_type);
+
+        assert_eq!(binary_array.as_ref(), &expected_binary_array);
+    }
+
     #[test]
     fn test_cast_from_f64() {
         let f64_values: Vec<f64> = vec![