From b9ded5bbf159fe238942da60a4e148c9bdf491c1 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 24 Mar 2024 20:36:41 +0800 Subject: [PATCH] feat: support live_view_array layout and basic construction --- arrow-array/src/array/list_view_array.rs | 1175 +++++++++++++++++ arrow-array/src/array/mod.rs | 31 +- .../src/builder/generic_list_builder.rs | 2 +- .../src/builder/generic_list_view_builder.rs | 705 ++++++++++ arrow-array/src/builder/mod.rs | 8 + arrow-array/src/builder/struct_builder.rs | 8 + arrow-array/src/cast.rs | 16 + arrow-array/src/iterator.rs | 4 +- arrow-array/src/record_batch.rs | 4 +- arrow-buffer/src/buffer/mod.rs | 3 + arrow-buffer/src/buffer/size.rs | 91 ++ arrow-data/src/data.rs | 69 +- arrow-data/src/equal/list_view.rs | 52 + arrow-data/src/equal/mod.rs | 7 +- 14 files changed, 2160 insertions(+), 15 deletions(-) create mode 100644 arrow-array/src/array/list_view_array.rs create mode 100644 arrow-array/src/builder/generic_list_view_builder.rs create mode 100644 arrow-buffer/src/buffer/size.rs create mode 100644 arrow-data/src/equal/list_view.rs diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs new file mode 100644 index 000000000000..b0e3a0f26898 --- /dev/null +++ b/arrow-array/src/array/list_view_array.rs @@ -0,0 +1,1175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{get_offsets, get_sizes, make_array, print_long_array}; +use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; +use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, FixedSizeListArray}; +use arrow_buffer::{NullBuffer, OffsetBuffer, SizeBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use std::any::Any; +use std::ops::Add; +use std::sync::Arc; +use crate::iterator::GenericListViewArrayIter; + +// See [`ListViewBuilder`](crate::builder::ListViewBuilder) for how to construct a [`ListViewArray`] +pub type ListViewArray = GenericListViewArray; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. +/// +// See [`LargeListViewArray`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] +pub type LargeListViewArray = GenericListViewArray; + +/// +/// Different than [`crate::GenericListArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. +/// +/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +pub struct GenericListViewArray { + data_type: DataType, + nulls: Option, + values: ArrayRef, + value_offsets: OffsetBuffer, + value_sizes: SizeBuffer, + len: usize, +} + + +impl Clone for GenericListViewArray { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.clone(), + value_sizes: self.value_sizes.clone(), + len: self.len, + } + } +} + +impl GenericListViewArray { + /// The data type constructor of listview array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], ListView or LargeListView. + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { + DataType::LargeListView + } else { + DataType::ListView + }; + + pub fn try_new( + field: FieldRef, + offsets: OffsetBuffer, + sizes: SizeBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + + //todo: check illegal offset and size values + let mut len = 0usize; + for i in 0..offsets.len() { + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + if offset + size > values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", + OffsetSize::PREFIX, offset, size, values.len() + ))); + } + len = len.add(size); + } + + let len = offsets.len(); + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if !field.is_nullable() && values.is_nullable() { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + value_sizes: sizes, + len, + }) + } + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: OffsetBuffer, + sizes: SizeBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, sizes, values, nulls).unwrap() + } + + /// Create a new [`GenericListViewArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: OffsetBuffer::new_zeroed(len), + value_sizes: SizeBuffer::new_zeroed(len), + values, + len: 0usize + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + OffsetBuffer, + SizeBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::ListView(f) | DataType::LargeListView(f) => f, + _ => unreachable!(), + }; + (f, self.value_offsets, self.value_sizes, self.values, self.nulls) + } + + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &OffsetBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] + pub fn values(&self) -> &ArrayRef { + &self.values + } + + /// Returns a reference to the sizes of this list + /// + /// Unlike [`Self::value_sizes`] this returns the [`SizeBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn sizes(&self) -> &SizeBuffer { + &self.value_sizes + } + + + /// Returns a clone of the value type of this list. + pub fn value_type(&self) -> DataType { + self.values.data_type().clone() + } + + /// Returns ith value of this list view array. + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let end = self.value_offsets().get_unchecked(i + 1).as_usize(); + let start = self.value_offsets().get_unchecked(i).as_usize(); + self.values.slice(start, end - start) + } + + /// Returns ith value of this list view array. + pub fn value(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + &self.value_offsets + } + + /// Returns the sizes values in the offsets buffer + #[inline] + pub fn value_sizes(&self) -> &[OffsetSize] { + &self.value_sizes + } + + /// Returns the length for value at index `i`. + #[inline] + pub fn value_length(&self, i: usize) -> OffsetSize { + self.value_sizes[i] + } + + /// constructs a new iterator + pub fn iter<'a>(&'a self) -> GenericListViewArrayIter<'a, OffsetSize> { + GenericListViewArrayIter::<'a, OffsetSize>::new(self) + } + + #[inline] + fn get_type(data_type: &DataType) -> Option<&DataType> { + match (OffsetSize::IS_LARGE, data_type) { + (true, DataType::LargeListView(child)) | (false, DataType::ListView(child)) => { + Some(child.data_type()) + } + _ => None, + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_sizes: self.value_sizes.slice(offset, length), + len: length, + } + } + + pub fn from_iter_primitive(iter: I) -> Self + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, + { + let iter = iter.into_iter(); + let size_hint = iter.size_hint().0; + let mut builder = + GenericListViewBuilder::with_capacity(PrimitiveBuilder::::new(), size_hint); + + for i in iter { + match i { + Some(p) => { + let mut size = 0usize; + for t in p { + builder.values().append_option(t); + size = size.add(1); + } + builder.append(true, size); + } + None => builder.append(false, 0), + } + } + builder.finish() + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } +} + + +impl Array for GenericListViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } +} + +impl std::fmt::Debug for GenericListViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let prefix = OffsetSize::PREFIX; + write!(f, "{prefix}ListViewArray\n[\n")?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl From> for ArrayData { + fn from(array: GenericListViewArray) -> Self { + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner(), array.value_sizes.into_inner().into_inner()]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } + } +} + +impl From for GenericListViewArray { + fn from(data: ArrayData) -> Self { + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListViewArray from ArrayDataRef failed") + } +} + +impl From for GenericListViewArray { + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + let sizes = SizeBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + value_sizes: sizes, + len: value.len(), + } + } +} + +impl GenericListViewArray { + fn try_new_from_array_data(data: ArrayData) -> Result { + if data.buffers().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray data should contain two buffer (value offsets & value size), had {}", + data.buffers().len() + ))); + } + + if data.child_data().len() != 1 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray should contain a single child array (values array), had {}", + data.child_data().len() + ))); + } + + let values = data.child_data()[0].clone(); + + if let Some(child_data_type) = Self::get_type(data.data_type()) { + if values.data_type() != child_data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's child datatype {:?} does not \ + correspond to the List's datatype {:?}", + values.data_type(), + child_data_type + ))); + } + } else { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is {:?}", + data.data_type() + ))); + } + + let values = make_array(values); + // SAFETY: + // ArrayData is valid, and verified type above + let value_offsets = unsafe { get_offsets(&data) }; + let value_sizes = unsafe { get_sizes(&data) }; + + Ok(Self { + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + values, + value_offsets, + value_sizes, + len: data.len(), + }) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::Int32Type; + use crate::{Int32Array, Int64Array}; + use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; + use arrow_schema::DataType::LargeListView; + use arrow_schema::Field; + use crate::builder::{FixedSizeListBuilder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + + fn create_from_buffers() -> ListViewArray { + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6])); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = SizeBuffer::new(ScalarBuffer::from(vec![3, 3, 2])); + + ListViewArray::new(field, offsets, sizes,Arc::new(values), None) + } + + + #[test] + fn test_from_iter_primitive() { + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6), Some(7)]), + ]; + let list_array = ListViewArray::from_iter_primitive::(data); + let another = create_from_buffers(); + assert_eq!(list_array, another) + } + + #[test] + fn test_empty_list_view_array() { + // Construct an empty value array + let value_data = ArrayData::builder(DataType::Int32) + .len(0) + .add_buffer(Buffer::from([])) + .build() + .unwrap(); + + // Construct an empty offset buffer + let value_offsets = Buffer::from([]); + // Construct an empty size buffer + let value_sizes = Buffer::from([]); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(0) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.len(), 0) + } + + + #[test] + fn test_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset (skip first element) + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + #[test] + fn test_large_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3i64, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + + #[test] + fn test_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_sizes()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + } + + #[test] + fn test_large_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + } + + #[test] + #[should_panic(expected = "index out of bounds: the len is 10 but the index is 10")] + fn test_list_view_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list array from the above two + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + assert_eq!(9, list_array.len()); + + list_array.value(10); + } + #[test] + #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_buffer_len() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic(expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_child_array_len() { + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic(expected = "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is ListView")] + fn test_from_array_data_validation() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.values().append_value(1); + builder.append(true, 0); + let array = builder.finish(); + let _ = LargeListViewArray::from(array.into_data()); + } + + #[test] + fn test_list_view_array_offsets_need_not_start_at_zero() { + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); + let value_sizes = Buffer::from_slice_ref([0, 0, 3, 2]); + + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.value_length(0), 0); + assert_eq!(list_array.value_length(1), 0); + assert_eq!(list_array.value_length(2), 3); + } + + #[test] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_alignment() { + let offset_buf = Buffer::from_slice_ref([0_u64]); + let offset_buf2 = offset_buf.slice(1); + + let size_buf = Buffer::from_slice_ref([0_u64]); + let size_buf2 = size_buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(values)) + .build_unchecked() + }; + + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(offset_buf2) + .add_buffer(size_buf2) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn list_array_equality() { + // test scaffold + fn do_comparison( + lhs_data: Vec>>>, + rhs_data: Vec>>>, + should_equal: bool, + ) { + let lhs = ListViewArray::from_iter_primitive::(lhs_data.clone()); + let rhs = ListViewArray::from_iter_primitive::(rhs_data.clone()); + assert_eq!(lhs == rhs, should_equal); + + let lhs = LargeListViewArray::from_iter_primitive::(lhs_data); + let rhs = LargeListViewArray::from_iter_primitive::(rhs_data); + assert_eq!(lhs == rhs, should_equal); + } + + do_comparison( + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + true, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + false, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(0), Some(0)]), + ], + false, + ); + + do_comparison( + vec![None, None, Some(vec![Some(1)])], + vec![None, None, Some(vec![Some(2)])], + false, + ); + } + + #[test] + fn test_empty_offsets() { + let f = Arc::new(Field::new("element", DataType::Int32, true)); + let string = ListViewArray::from( + ArrayData::builder(DataType::ListView(f.clone())) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + + let string = LargeListViewArray::from( + ArrayData::builder(DataType::LargeListView(f)) + .buffers(vec![Buffer::from(&[]),Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + } + + #[test] + fn test_try_new() { + let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); + let sizes = SizeBuffer::new(vec![1, 3, 1, 0 ].into()); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListViewArray::new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None); + + let nulls = NullBuffer::new_null(4); + ListViewArray::new(field.clone(), offsets, sizes.clone(), values.clone(), Some(nulls)); + + let nulls = NullBuffer::new_null(4); + let offsets = OffsetBuffer::new(vec![0, 1, 2, 3, 4].into()); + let sizes = SizeBuffer::new(vec![1, 1, 1, 1, 0].into()); + let err = LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(), Some(nulls)) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for LargeListViewArray, expected 5 got 4" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = LargeListViewArray::try_new(field.clone(), offsets.clone(), sizes.clone(), values.clone(), None) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListViewArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = + LargeListViewArray::try_new(field, offsets.clone(), sizes.clone(), values.clone(),None).unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" + ); + + } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + } +} \ No newline at end of file diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b115ff9c14cc..f981ba4021a3 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer, SizeBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -65,13 +65,14 @@ mod union_array; pub use union_array::*; mod run_array; - pub use run_array::*; mod byte_view_array; - pub use byte_view_array::*; +mod list_view_array; +pub use list_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -519,6 +520,12 @@ impl PartialEq for GenericListArray { } } +impl PartialEq for GenericListViewArray { + fn eq(&self, other: &Self) -> bool { + self.to_data().eq(&other.to_data()) + } +} + impl PartialEq for MapArray { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -606,7 +613,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, + DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef, DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, + DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, @@ -687,6 +696,22 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } } +/// Helper function that gets size from an [`ArrayData`] +/// +/// # Safety +unsafe fn get_sizes(data: &ArrayData) -> SizeBuffer { + match data.is_empty() && data.buffers()[1].is_empty() { + true => SizeBuffer::new_empty(), + false => { + let buffer = + ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + // Safety: + // ArrayData is valid + SizeBuffer::new(buffer) + } + } +} + /// Helper function for printing potentially long arrays. fn print_long_array(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 25903bcf546b..d6fff12e1ab8 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -548,7 +548,7 @@ mod tests { } #[test] - fn test_boxed_primitive_aray_builder() { + fn test_boxed_primitive_array_builder() { let values_builder = make_builder(&DataType::Int32, 5); let mut builder = ListBuilder::new(values_builder); diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs new file mode 100644 index 000000000000..59ec9592f5aa --- /dev/null +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +use std::any::Any; +use std::sync::Arc; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer}; +use arrow_schema::{Field, FieldRef}; +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; + +#[derive(Debug)] +pub struct GenericListViewBuilder { + offsets_builder: BufferBuilder, + sizes_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, +} + + + + +impl Default for GenericListViewBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl GenericListViewBuilder { + /// Creates a new [`GenericListBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + } + } + + /// Override the field passed to [`GenericListArray::new`] + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl ArrayBuilder +for GenericListViewBuilder + where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListViewBuilder + where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool, size: usize) { + self.offsets_builder.append(OffsetSize::from_usize(self.values_builder.len() - size).unwrap()); + let size = OffsetSize::from_usize(size).unwrap(); + self.sizes_builder.append(size); + self.null_buffer_builder.append(is_valid); + } + + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.sizes_builder.append(OffsetSize::from_usize(0).unwrap()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListViewArray`] and reset this builder. + pub fn finish(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + + let offsets = self.offsets_builder.finish(); + // Safety: Safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let sizes = self.sizes_builder.finish(); + let sizes = SizeBuffer::new(sizes.into()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values ,nulls) + } + + /// Builds the [`GenericListArray`] without resetting the builder. + pub fn finish_cloned(&self) -> GenericListViewArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); + let sizes = SizeBuffer::new(sizes.into()); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } +} + +impl Extend> for GenericListViewBuilder + where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + let origin_size = self.values_builder.len(); + self.values_builder.extend(elements); + let size = self.values_builder.len() - origin_size; + self.append(true, size); + } + None => self.append(false, 0), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{Int32Builder, ListViewBuilder, make_builder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Array, Int32Array}; + use arrow_schema::DataType; + + fn _test_generic_list_view_array_builder() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true,3); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true,3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true,2); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6].map(O::usize_as)); + assert_eq!(list_array.value_sizes(), [3, 3, 2].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2)); + for i in 0..2 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + #[test] + fn test_large_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + fn _test_generic_list_view_array_builder_nulls() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true,3); + builder.append(false,0); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true,3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true,2); + + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); + + } + + #[test] + fn test_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_large_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_list_view_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true,1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true,1); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true,1); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true,1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true,1); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true,1); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_list_view_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListViewBuilder::new(primitive_builder); + let mut builder = ListViewBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true,2); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true,2); + builder.append(true,2); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true,3); + builder.values().append(false,0); + builder.values().values().append_value(8); + builder.values().append(true,1); + builder.append(true,3); + + builder.append(false,0); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true,2); + builder.append(true,1); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5]); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1]); + + let l2 = l1.values().as_list_view::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8]); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6]); + assert_eq!(array.value_sizes(), [4, 0, 2, 0]); + assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListViewBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true,1); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true,1); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_view_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_view_large_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + fn test_boxed_generic_list_view_generic_list_view_array_builder( + values_builder: Box, + ) { + let mut builder: GenericListViewBuilder> = + GenericListViewBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder.append(true,2); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an (Large)ListViewBuilder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(false,0); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,1); + builder.append(true,3); + + builder.append(false,0); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true,2); + builder.append(true,1); + + let l1 = builder.finish(); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5].map(O::usize_as)); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1].map(O::usize_as)); + + let l2 = l1.values().as_list_view::(); + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8].map(O::usize_as)); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::ListView(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::ListView(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic(expected = "Non-nullable field of ListViewArray \\\"item\\\" cannot contain nulls")] + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListViewArray expected data type Int64 got Int32")] + fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } +} + diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index e4ab7ae4ba23..109492de18ea 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -181,6 +181,8 @@ pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; mod union_builder; +mod generic_list_view_builder; +pub use generic_list_view_builder::*; pub use union_builder::*; @@ -315,3 +317,9 @@ pub type StringBuilder = GenericStringBuilder; /// Builder for [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringBuilder = GenericStringBuilder; + +/// Builder for [`ListViewArray`](crate::array::ListViewArray) +pub type ListViewBuilder = GenericListViewBuilder; + +/// Builder for [`LargeListViewArray`](crate::array::LargeListViewArray) +pub type LargeListViewBuilder = GenericListViewBuilder; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 1e2e402f745f..52ed42cab6cf 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -253,6 +253,14 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeListView(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { let map_field_names = MapFieldNames { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 2e21f3e7e640..12b18e9817f5 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -795,6 +795,14 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`GenericListViewArray`] returning `None` if not possible + fn as_list_view_opt(&self) -> Option<&GenericListViewArray>; + + /// Downcast this to a [`GenericListViewArray`] panicking if not possible + fn as_list_view(&self) -> &GenericListViewArray { + self.as_list_view_opt().expect("list view array") + } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; @@ -860,6 +868,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_any().downcast_ref() } @@ -907,6 +919,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_ref().as_list_view_opt() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_ref().as_fixed_size_binary_opt() } diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c1..5b36d19359eb 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -19,7 +19,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, - GenericStringArray, PrimitiveArray, + GenericListViewArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; use arrow_buffer::NullBuffer; @@ -141,6 +141,8 @@ pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; /// an iterator that returns Some(T) or None, that can be used on any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +/// an iterator that returns Some(T) or None, that can be used on any ListArray +pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray>; /// an iterator that returns Some(T) or None, that can be used on any MapArray pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index c56b1fd308cf..5fcfaa0b2709 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -626,9 +626,7 @@ mod tests { use std::collections::HashMap; use super::*; - use crate::{ - BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray, - }; + use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index d33e68795e4e..c00bd326b3cb 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -32,4 +32,7 @@ pub use boolean::*; mod null; pub use null::*; mod run; +mod size; +pub use size::*; + pub use run::*; diff --git a/arrow-buffer/src/buffer/size.rs b/arrow-buffer/src/buffer/size.rs new file mode 100644 index 000000000000..9d2e5889ca1c --- /dev/null +++ b/arrow-buffer/src/buffer/size.rs @@ -0,0 +1,91 @@ +use std::ops::Deref; +use crate::{ArrowNativeType, MutableBuffer, ScalarBuffer}; + +#[derive(Debug, Clone)] +pub struct SizeBuffer(ScalarBuffer); + +impl SizeBuffer { + + /// Create a new [`SizeBuffer`] containing `len` `0` values + pub fn new_zeroed(len: usize) -> Self { + let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); + let buffer = MutableBuffer::from_len_zeroed(len_bytes); + Self(buffer.into_buffer().into()) + } + + /// Create a new [`SizeBuffer`] from the provided [`ScalarBuffer`] + pub fn new(buffer: ScalarBuffer) -> Self { + assert!(!buffer.is_empty(), "offsets cannot be empty"); + assert!( + buffer[0] >= O::usize_as(0), + "offsets must be greater than 0" + ); + Self(buffer) + } + + /// Create a new [`SizeBuffer`] containing a single 0 value + pub fn new_empty() -> Self { + let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); + Self(buffer.into_buffer().into()) + } + + /// Create a new [`SizeBuffer`] from the iterator of slice lengths + /// + /// ``` + /// # use arrow_buffer::SizeBuffer; + /// let offsets = SizeBuffer::::from_lengths([1, 3, 5]); + /// assert_eq!(offsets.as_ref(), &[1, 3, 5]); + /// ``` + pub fn from_lengths(lengths: I) -> Self + where + I: IntoIterator, + { + let iter = lengths.into_iter(); + let mut out = Vec::with_capacity(iter.size_hint().0); + + for size in iter { + out.push(O::usize_as(size)) + } + Self(out.into()) + } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.0 + } + + /// Returns the inner [`ScalarBuffer`], consuming self + pub fn into_inner(self) -> ScalarBuffer { + self.0 + } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self(self.0.slice(offset, len.saturating_add(1))) + } + + /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.0.ptr_eq(&other.0) + } +} + + +impl Deref for SizeBuffer { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef<[T]> for SizeBuffer { + #[inline] + fn as_ref(&self) -> &[T] { + self + } +} diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index e227b168eee5..0d832a453693 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -119,17 +119,21 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff [buffer, empty_buffer] } DataType::ListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::LargeList(_) => { - // offset buffer always starts with a zero + // offset buffer let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); buffer.push(0i64); [buffer, empty_buffer] } DataType::LargeListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::FixedSizeBinary(size) => { @@ -856,6 +860,17 @@ impl ArrayData { self.typed_buffer(0, self.len + 1) } + /// Returns a reference to the data in `buffer` as a typed slice + /// after validating. The returned slice is guaranteed to have at + /// least `len` entries. + fn typed_sizes(&self) -> Result<&[T], ArrowError> { + // An empty list-like array can have 0 sizes + if self.len == 0 && self.buffers[1].is_empty() { + return Ok(&[]); + } + self.typed_buffer(1, self.len) + } + /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating fn typed_buffer( &self, @@ -929,6 +944,26 @@ impl ArrayData { Ok(()) } + fn validate_sizes( + &self, + values_length: usize, + ) -> Result<(), ArrowError> { + let sizes = self.typed_sizes::()?; + if sizes.is_empty() { + return Ok(()); + } + + let total_size: usize = sizes.iter().map(|x| x.to_usize().unwrap()).sum(); + if total_size > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "Total size of {} is larger than values length {}", + total_size, values_length, + ))); + } + + Ok(()) + } + /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { @@ -937,11 +972,23 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::ListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(values_data.len)?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::LargeList(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::LargeListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(values_data.len)?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::FixedSizeList(field, list_size) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -1546,9 +1593,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => DataTypeLayout::new_list_view::(), + DataType::LargeListView(_) => DataTypeLayout::new_list_view::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, @@ -1652,6 +1698,21 @@ impl DataTypeLayout { variadic: true, } } + + /// Describes a list view type + pub fn new_list_view() -> Self { + Self { + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + },BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], + can_contain_null_mask: true, + variadic: true, + } + } } /// Layout specification for a single data type buffer diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 000000000000..b6a5d7e945fa --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,52 @@ +use num::Integer; +use arrow_buffer::ArrowNativeType; +use crate::ArrayData; +use crate::data::count_nulls; + +use super::equal_range; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let rhs_offsets = rhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + let rhs_sizes = rhs.buffer::(1); + for i in 0..len { + + // compare offsets and sizes + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_offset_start = lhs_offsets[lhs_pos].to_usize().unwrap(); + let rhs_offset_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_size = lhs_sizes[lhs_pos].to_usize().unwrap(); + let rhs_size = rhs_sizes[rhs_pos].to_usize().unwrap(); + if lhs_size != rhs_size { + return false; + } + + // compare nulls + let lhs_null_count = count_nulls(lhs.nulls(), lhs_offset_start, lhs_size); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_offset_start, lhs_size); + if lhs_null_count != rhs_null_count { + return false; + } + + print!("lhs_offset_start: {}, rhs_offset_start: {}, lhs_size: {}\n", lhs_offset_start, rhs_offset_start, lhs_size); + // compare values + if !equal_range( + &lhs.child_data()[0], + &rhs.child_data()[0], + lhs_offset_start, + lhs_offset_start, + lhs_size, + ) { + return false; + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index dba6a0186a56..0c721a595eb4 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -37,6 +37,7 @@ mod structure; mod union; mod utils; mod variable_size; +mod list_view; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used @@ -52,6 +53,7 @@ use primitive::primitive_equal; use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; +use crate::equal::list_view::list_view_equal; use self::run::run_equal; @@ -102,10 +104,9 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not yet implemented") - } + DataType::ListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),