From 4c5105b98eea543831eb20c9a55b80c4f3fc25da Mon Sep 17 00:00:00 2001 From: RinChanNOWWW Date: Wed, 24 Apr 2024 19:26:31 +0800 Subject: [PATCH 1/4] Support casting from byte array to byte view array. --- arrow-array/src/array/byte_view_array.rs | 12 +++ arrow-cast/src/cast/mod.rs | 123 ++++++++++++++++++++++- 2 files changed, 133 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index e2839b19e5f3..79f2d47587a6 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -428,6 +428,18 @@ impl BinaryViewArray { } } +impl From> for BinaryViewArray { + fn from(v: Vec<&[u8]>) -> Self { + Self::from_iter_values(v) + } +} + +impl From>> for BinaryViewArray { + fn from(v: Vec>) -> Self { + v.into_iter().collect() + } +} + /// A [`GenericByteViewArray`] that stores utf8 data /// /// # Example diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 8b7579c4cfc0..3e19f4d04b23 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -46,6 +46,8 @@ use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::string::*; +use arrow_buffer::ScalarBuffer; +use arrow_data::ByteView; use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; use std::sync::Arc; @@ -119,6 +121,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Utf8 | LargeBinary | LargeUtf8 + | BinaryView + | Utf8View | List(_) | LargeList(_) | FixedSizeList(_, _) @@ -192,8 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 } - (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, - (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, + (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true, + 
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true, (FixedSizeBinary(_), Binary | LargeBinary) => true, ( Utf8 | LargeUtf8, @@ -213,6 +217,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(Nanosecond, _) | Interval(_), ) => true, + (Utf8 | LargeUtf8, Utf8View) => true, (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -611,6 +616,8 @@ pub fn cast_with_options( | Utf8 | LargeBinary | LargeUtf8 + | BinaryView + | Utf8View | List(_) | LargeList(_) | FixedSizeList(_, _) @@ -1120,6 +1127,7 @@ pub fn cast_with_options( let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } + Utf8View => cast_byte_to_view::(array), LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { @@ -1179,6 +1187,7 @@ pub fn cast_with_options( LargeBinary => Ok(Arc::new(LargeBinaryArray::from( array.as_string::().clone(), ))), + Utf8View => cast_byte_to_view::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { parse_string::(array, cast_options) @@ -1226,6 +1235,7 @@ pub fn cast_with_options( FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } + BinaryView => cast_byte_to_view::(array), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1240,6 +1250,7 @@ pub fn cast_with_options( FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } + BinaryView => cast_byte_to_view::(array), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -2238,6 +2249,50 @@ where Ok(Arc::new(GenericByteArray::::from(array_data))) } +/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` array. 
+fn cast_byte_to_view(array: &dyn Array) -> Result +where + FROM: ByteArrayType, + FROM::Offset: OffsetSizeTrait + ToPrimitive, + V: ByteViewType, +{ + let data = array.to_data(); + let len = array.len(); + assert_eq!(data.data_type(), &FROM::DATA_TYPE); + let str_values_buf = data.buffers()[1].clone(); + let offsets = data.buffers()[0].typed_data::(); + + let mut views_builder = BufferBuilder::::new(len); + for w in offsets.windows(2) { + let offset = w[0].to_u32().unwrap(); + let end = w[1].to_u32().unwrap(); + let value_buf = &str_values_buf[offset as usize..end as usize]; + let length = end - offset; + + if length <= 12 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); + view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf); + views_builder.append(u128::from_le_bytes(view_buffer)); + } else { + let view = ByteView { + length, + prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()), + buffer_index: 0, + offset, + }; + views_builder.append(view.into()); + } + } + + assert_eq!(views_builder.len(), len); + Ok(Arc::new(GenericByteViewArray::::new( + ScalarBuffer::new(views_builder.finish(), 0, len), + vec![str_values_buf], + data.nulls().cloned(), + ))) +} + #[cfg(test)] mod tests { use arrow_buffer::{Buffer, NullBuffer}; @@ -5044,6 +5099,70 @@ mod tests { } } + #[test] + fn test_string_to_view() { + _test_string_to_view::(); + _test_string_to_view::(); + } + + fn _test_string_to_view() + where + O: OffsetSizeTrait, + { + let data = vec![ + Some("hello"), + Some("world"), + None, + Some("large payload over 12 bytes"), + Some("lulu"), + ]; + + let string_array = GenericStringArray::::from(data.clone()); + + assert!(can_cast_types( + string_array.data_type(), + &DataType::Utf8View + )); + + let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap(); + assert_eq!(string_view_array.data_type(), &DataType::Utf8View); + + let expect_string_view_array = StringViewArray::from(data); + 
assert_eq!(string_view_array.as_ref(), &expect_string_view_array); + } + + #[test] + fn test_bianry_to_view() { + _test_binary_to_view::(); + _test_binary_to_view::(); + } + + fn _test_binary_to_view() + where + O: OffsetSizeTrait, + { + let data: Vec> = vec![ + Some(b"hello"), + Some(b"world"), + None, + Some(b"large payload over 12 bytes"), + Some(b"lulu"), + ]; + + let binary_array = GenericBinaryArray::::from(data.clone()); + + assert!(can_cast_types( + binary_array.data_type(), + &DataType::BinaryView + )); + + let binary_view_array = cast(&binary_array, &DataType::BinaryView).unwrap(); + assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); + + let expect_binary_view_array = BinaryViewArray::from(data); + assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); + } + #[test] fn test_cast_from_f64() { let f64_values: Vec = vec![ From c0cc317b0a7cafe1f5a822ec8507a8947f66843b Mon Sep 17 00:00:00 2001 From: RinChanNOWWW Date: Fri, 26 Apr 2024 20:02:07 +0800 Subject: [PATCH 2/4] Use new_unchecked. 
--- arrow-cast/src/cast/mod.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 3e19f4d04b23..8b1b97561e25 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2257,8 +2257,9 @@ where V: ByteViewType, { let data = array.to_data(); - let len = array.len(); assert_eq!(data.data_type(), &FROM::DATA_TYPE); + + let len = array.len(); let str_values_buf = data.buffers()[1].clone(); let offsets = data.buffers()[0].typed_data::(); @@ -2286,11 +2287,14 @@ where } assert_eq!(views_builder.len(), len); - Ok(Arc::new(GenericByteViewArray::::new( - ScalarBuffer::new(views_builder.finish(), 0, len), - vec![str_values_buf], - data.nulls().cloned(), - ))) + + Ok(Arc::new(unsafe { + GenericByteViewArray::::new_unchecked( + ScalarBuffer::new(views_builder.finish(), 0, len), + vec![str_values_buf], + data.nulls().cloned(), + ) + })) } #[cfg(test)] From d857cee67caf555fedae54522ffd58429f7ee722 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Apr 2024 11:23:27 -0400 Subject: [PATCH 3/4] Add safety justification comment --- arrow-cast/src/cast/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 8b1b97561e25..b5b45f98feab 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2288,6 +2288,8 @@ where assert_eq!(views_builder.len(), len); + /// Safety: the input was a valid array so the data was UTF8 (if string) and all offsets were valid + /// and we created the views correctly Ok(Arc::new(unsafe { GenericByteViewArray::::new_unchecked( ScalarBuffer::new(views_builder.finish(), 0, len), From 670d161f4005079ee53b8e5026f46ca9d96ce5fb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Apr 2024 11:44:46 -0400 Subject: [PATCH 4/4] Fix comments :facepalm --- arrow-cast/src/cast/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index b5b45f98feab..36072760ed07 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2288,8 +2288,8 @@ where assert_eq!(views_builder.len(), len); - /// Safety: the input was a valid array so the data was UTF8 (if string) and all offsets were valid - /// and we created the views correctly + // Safety: the input was a valid array so it is valid UTF8 (if string). And + // all offsets were valid and we created the views correctly Ok(Arc::new(unsafe { GenericByteViewArray::::new_unchecked( ScalarBuffer::new(views_builder.finish(), 0, len),