Skip to content

Commit

Permalink
Improve performance of casting DictionaryArray to StringViewArray (
Browse files Browse the repository at this point in the history
…#5871)

* zero-copy dict to view

* refactor to use try_append_view

* unchecked view

* make fmt happy

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb authored Jun 13, 2024
1 parent 601a722 commit 77271c4
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 47 deletions.
47 changes: 32 additions & 15 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
offset as u32
}

/// Append a view of the given `block`, `offset` and `length`
///
/// # Safety
/// (1) The block must have been added using [`Self::append_block`]
/// (2) The range `offset..offset+length` must be within the bounds of the block
/// (3) The data in the block must be valid of type `T`
pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
let b = self.completed.get_unchecked(block as usize);
let start = offset as usize;
let end = start.saturating_add(len as usize);
let b = b.get_unchecked(start..end);

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
}

self.null_buffer_builder.append_non_null();
}

/// Try to append a view of the given `block`, `offset` and `length`
///
/// See [`Self::append_block`]
Expand All @@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
));
}

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
unsafe {
self.append_view_unchecked(block, offset, len);
}

self.null_buffer_builder.append_non_null();
Ok(())
}

Expand Down
59 changes: 59 additions & 0 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,69 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(

Ok(new_array)
}
Utf8View => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<StringArray>()
.unwrap();

let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(string_view))
}
BinaryView => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<BinaryArray>()
.unwrap();

let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}

fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
array: &GenericByteArray<V>,
keys: &PrimitiveArray<K>,
) -> GenericByteViewArray<T> {
let value_buffer = array.values();
let value_offsets = array.value_offsets();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
builder.append_block(value_buffer.clone());
for i in keys.iter() {
match i {
Some(v) => {
let idx = v.to_usize().unwrap();

// Safety
// (1) The index is within bounds as they are offsets
// (2) The append_view is safe
unsafe {
let offset = value_offsets.get_unchecked(idx).as_usize();
let end = value_offsets.get_unchecked(idx + 1).as_usize();
let length = end - offset;
builder.append_view_unchecked(0, offset as u32, length as u32)
}
}
None => {
builder.append_null();
}
}
}
builder.finish()
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
pub(crate) fn unpack_dictionary<K>(
array: &dyn Array,
Expand Down
84 changes: 52 additions & 32 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5203,19 +5203,19 @@ mod tests {
_test_string_to_view::<i64>();
}

const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

fn _test_string_to_view<O>()
where
O: OffsetSizeTrait,
{
let data = vec![
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

let string_array = GenericStringArray::<O>::from(data.clone());
let string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);

assert!(can_cast_types(
string_array.data_type(),
Expand All @@ -5225,7 +5225,7 @@ mod tests {
let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap();
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);

let expect_string_view_array = StringViewArray::from(data);
let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
}

Expand All @@ -5239,15 +5239,7 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&[u8]>> = vec![
Some(b"hello"),
Some(b"world"),
None,
Some(b"large payload over 12 bytes"),
Some(b"lulu"),
];

let binary_array = GenericBinaryArray::<O>::from(data.clone());
let binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);

assert!(can_cast_types(
binary_array.data_type(),
Expand All @@ -5257,10 +5249,48 @@ mod tests {
let binary_view_array = cast(&binary_array, &DataType::BinaryView).unwrap();
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);

let expect_binary_view_array = BinaryViewArray::from(data);
let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}

#[test]
fn test_dict_to_view() {
let values = StringArray::from_iter(VIEW_TEST_DATA);
let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
let string_dict_array =
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
let typed_dict = string_dict_array.downcast_dict::<StringArray>().unwrap();

let string_view_array = {
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_string_array_type = string_view_array.data_type();
let casted_string_array = cast(&string_dict_array, expected_string_array_type).unwrap();
assert_eq!(casted_string_array.data_type(), expected_string_array_type);
assert_eq!(casted_string_array.as_ref(), &string_view_array);

let binary_buffer = cast(&typed_dict.values(), &DataType::Binary).unwrap();
let binary_dict_array =
DictionaryArray::<Int8Type>::new(typed_dict.keys().clone(), binary_buffer);
let typed_binary_dict = binary_dict_array.downcast_dict::<BinaryArray>().unwrap();

let binary_view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_binary_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_binary_array_type = binary_view_array.data_type();
let casted_binary_array = cast(&binary_dict_array, expected_binary_array_type).unwrap();
assert_eq!(casted_binary_array.data_type(), expected_binary_array_type);
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}

#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
Expand All @@ -5271,24 +5301,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&str>> = vec![
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

let view_array = {
// ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};

let expected_string_array = GenericStringArray::<O>::from(data);
let expected_string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_string_array.data_type();

assert!(can_cast_types(view_array.data_type(), expected_type));
Expand Down Expand Up @@ -5318,7 +5339,6 @@ mod tests {
];

let view_array = {
// ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
builder.append_option(*s);
Expand Down

0 comments on commit 77271c4

Please sign in to comment.