diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index e7f13a68288a..6ec34bf5a91c 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -116,6 +116,36 @@ impl GenericByteViewBuilder { offset as u32 } + /// Append a view of the given `block`, `offset` and `length` + /// + /// # Safety + /// (1) The block must have been added using [`Self::append_block`] + /// (2) The range `offset..offset+length` must be within the bounds of the block + /// (3) The data in the block must be valid of type `T` + pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { + let b = self.completed.get_unchecked(block as usize); + let start = offset as usize; + let end = start.saturating_add(len as usize); + let b = b.get_unchecked(start..end); + + if len <= 12 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); + view_buffer[4..4 + b.len()].copy_from_slice(b); + self.views_builder.append(u128::from_le_bytes(view_buffer)); + } else { + let view = ByteView { + length: len, + prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()), + buffer_index: block, + offset, + }; + self.views_builder.append(view.into()); + } + + self.null_buffer_builder.append_non_null(); + } + /// Try to append a view of the given `block`, `offset` and `length` /// /// See [`Self::append_block`] @@ -139,22 +169,9 @@ impl GenericByteViewBuilder { )); } - if len <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); - view_buffer[4..4 + b.len()].copy_from_slice(b); - self.views_builder.append(u128::from_le_bytes(view_buffer)); - } else { - let view = ByteView { - length: len, - prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()), - buffer_index: block, - offset, - }; - self.views_builder.append(view.into()); + unsafe { + self.append_view_unchecked(block, offset, len); } - - self.null_buffer_builder.append_non_null(); Ok(()) } diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index 244e101f1d8d..d929277a4da3 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -85,10 +85,69 @@ pub(crate) fn dictionary_cast( Ok(new_array) } + Utf8View => { + // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer. + // we handle it here to avoid the copy. + let dict_array = array + .as_dictionary::() + .downcast_dict::() + .unwrap(); + + let string_view = view_from_dict_values::>( + dict_array.values(), + dict_array.keys(), + ); + Ok(Arc::new(string_view)) + } + BinaryView => { + // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer. + // we handle it here to avoid the copy. + let dict_array = array + .as_dictionary::() + .downcast_dict::() + .unwrap(); + + let binary_view = view_from_dict_values::( + dict_array.values(), + dict_array.keys(), + ); + Ok(Arc::new(binary_view)) + } _ => unpack_dictionary::(array, to_type, cast_options), } } +fn view_from_dict_values( + array: &GenericByteArray, + keys: &PrimitiveArray, +) -> GenericByteViewArray { + let value_buffer = array.values(); + let value_offsets = array.value_offsets(); + let mut builder = GenericByteViewBuilder::::with_capacity(keys.len()); + builder.append_block(value_buffer.clone()); + for i in keys.iter() { + match i { + Some(v) => { + let idx = v.to_usize().unwrap(); + + // Safety + // (1) The index is within bounds as they are offsets + // (2) The append_view is safe + unsafe { + let offset = value_offsets.get_unchecked(idx).as_usize(); + let end = value_offsets.get_unchecked(idx + 1).as_usize(); + let length = end - offset; + builder.append_view_unchecked(0, offset as u32, length as u32) + } + } + None => { + builder.append_null(); + } + } + } + builder.finish() +} + // Unpack a dictionary where the keys are of type into a flattened array of type to_type pub(crate) fn unpack_dictionary( array: &dyn Array, diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index ae2240bd9404..e073e34cb6e4 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -5203,19 +5203,19 @@ mod tests { _test_string_to_view::(); } + const VIEW_TEST_DATA: [Option<&str>; 5] = [ + Some("hello"), + Some("world"), + None, + Some("large payload over 12 bytes"), + Some("lulu"), + ]; + fn _test_string_to_view() where O: OffsetSizeTrait, { - let data = vec![ - Some("hello"), - Some("world"), - None, - Some("large payload over 12 bytes"), - Some("lulu"), - ]; - - let string_array = GenericStringArray::::from(data.clone()); + let string_array = GenericStringArray::::from_iter(VIEW_TEST_DATA); assert!(can_cast_types( string_array.data_type(), @@ -5225,7 +5225,7 @@ mod tests { let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap(); assert_eq!(string_view_array.data_type(), &DataType::Utf8View); - let expect_string_view_array = StringViewArray::from(data); + let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); assert_eq!(string_view_array.as_ref(), &expect_string_view_array); } @@ -5239,15 +5239,7 @@ mod tests { where O: OffsetSizeTrait, { - let data: Vec> = vec![ - Some(b"hello"), - Some(b"world"), - None, - Some(b"large payload over 12 bytes"), - Some(b"lulu"), - ]; - - let binary_array = GenericBinaryArray::::from(data.clone()); + let binary_array = GenericBinaryArray::::from_iter(VIEW_TEST_DATA); assert!(can_cast_types( binary_array.data_type(), @@ -5257,10 +5249,48 @@ mod tests { let binary_view_array = cast(&binary_array, &DataType::BinaryView).unwrap(); assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); - let expect_binary_view_array = BinaryViewArray::from(data); + let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); } + #[test] + fn test_dict_to_view() { + let values = StringArray::from_iter(VIEW_TEST_DATA); + let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]); + let string_dict_array = + DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + let typed_dict = string_dict_array.downcast_dict::().unwrap(); + + let string_view_array = { + let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers. + for v in typed_dict.into_iter() { + builder.append_option(v); + } + builder.finish() + }; + let expected_string_array_type = string_view_array.data_type(); + let casted_string_array = cast(&string_dict_array, expected_string_array_type).unwrap(); + assert_eq!(casted_string_array.data_type(), expected_string_array_type); + assert_eq!(casted_string_array.as_ref(), &string_view_array); + + let binary_buffer = cast(&typed_dict.values(), &DataType::Binary).unwrap(); + let binary_dict_array = + DictionaryArray::::new(typed_dict.keys().clone(), binary_buffer); + let typed_binary_dict = binary_dict_array.downcast_dict::().unwrap(); + + let binary_view_array = { + let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers. + for v in typed_binary_dict.into_iter() { + builder.append_option(v); + } + builder.finish() + }; + let expected_binary_array_type = binary_view_array.data_type(); + let casted_binary_array = cast(&binary_dict_array, expected_binary_array_type).unwrap(); + assert_eq!(casted_binary_array.data_type(), expected_binary_array_type); + assert_eq!(casted_binary_array.as_ref(), &binary_view_array); + } + #[test] fn test_view_to_string() { _test_view_to_string::(); @@ -5271,24 +5301,15 @@ mod tests { where O: OffsetSizeTrait, { - let data: Vec> = vec![ - Some("hello"), - Some("world"), - None, - Some("large payload over 12 bytes"), - Some("lulu"), - ]; - let view_array = { - // ["hello", "world", null, "large payload over 12 bytes", "lulu"] let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers. - for s in data.iter() { + for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); } builder.finish() }; - let expected_string_array = GenericStringArray::::from(data); + let expected_string_array = GenericStringArray::::from_iter(VIEW_TEST_DATA); let expected_type = expected_string_array.data_type(); assert!(can_cast_types(view_array.data_type(), expected_type)); @@ -5318,7 +5339,6 @@ mod tests { ]; let view_array = { - // ["hello", "world", null, "large payload over 12 bytes", "lulu"] let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers. for s in data.iter() { builder.append_option(*s);