diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index db825bbea97d..a0af2c268318 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -94,7 +94,7 @@ pub struct GenericByteArray { impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { - data_type: self.data_type.clone(), + data_type: T::DATA_TYPE, value_offsets: self.value_offsets.clone(), value_data: self.value_data.clone(), nulls: self.nulls.clone(), diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs new file mode 100644 index 000000000000..52f5ee05840e --- /dev/null +++ b/arrow-array/src/array/byte_view_array.rs @@ -0,0 +1,346 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::print_long_array; +use crate::builder::GenericByteViewBuilder; +use crate::iterator::ArrayIter; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{Array, ArrayAccessor, ArrayRef}; +use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; +use arrow_data::view::View; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +/// An array of variable length byte view arrays +pub struct GenericByteViewArray { + data_type: DataType, + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + phantom: PhantomData, +} + +impl Clone for GenericByteViewArray { + fn clone(&self) -> Self { + Self { + data_type: T::DATA_TYPE, + views: self.views.clone(), + buffers: self.buffers.clone(), + nulls: self.nulls.clone(), + phantom: Default::default(), + } + } +} + +impl GenericByteViewArray { + /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`GenericByteViewArray::try_new`] returns an error + pub fn new( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Self { + Self::try_new(views, buffers, nulls).unwrap() + } + + /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `views.len() != nulls.len()` + /// * [ByteViewType::validate] fails + pub fn try_new( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Result { + // Verify data is valid + T::validate(&views, &buffers)?; + + if let Some(n) = nulls.as_ref() { + if n.len() != views.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ViewArray, expected {} got {}", + T::PREFIX, + views.len(), + n.len(), + ))); + } + } + + Ok(Self { + data_type: T::DATA_TYPE, + phantom: Default::default(), + views, + buffers, + nulls, + }) + } + + /// Create a new [`GenericByteViewArray`] from the provided parts, without validation + /// + /// # Safety + /// + /// Safe if [`Self::try_new`] would not error + pub unsafe fn new_unchecked( + views: ScalarBuffer, + buffers: Vec, + nulls: Option, + ) -> Self { + Self { + data_type: T::DATA_TYPE, + phantom: Default::default(), + views, + buffers, + nulls, + } + } + + /// Create a new [`GenericByteViewArray`] of length `len` where all values are null + pub fn new_null(len: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + views: vec![0; len].into(), + buffers: vec![], + nulls: Some(NullBuffer::new_null(len)), + phantom: Default::default(), + } + } + + /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls + pub fn from_iter_values(iter: I) -> Self + where + Ptr: AsRef, + I: IntoIterator, + { + let iter = iter.into_iter(); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); + for v in iter { + builder.append_value(v); + } + builder.finish() + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (ScalarBuffer, Vec, Option) { + (self.views, self.buffers, self.nulls) + } + + /// Returns the views buffer + #[inline] + pub fn views(&self) -> &ScalarBuffer { + &self.views + } + + /// Returns the buffers storing string data + #[inline] + pub fn data_buffers(&self) -> &[Buffer] { + &self.buffers + } + + /// Returns the element at index `i` + /// # Panics + /// Panics if index `i` is out of bounds. + pub fn value(&self, i: usize) -> &T::Native { + assert!( + i < self.len(), + "Trying to access an element at index {} from a {}ViewArray of length {}", + i, + T::PREFIX, + self.len() + ); + + assert!(i < self.views.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the element at index `i` + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array + pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { + let v = self.views.get_unchecked(idx); + let len = *v as u32; + let b = if len <= 12 { + let ptr = self.views.as_ptr() as *const u8; + std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize) + } else { + let view = View::from(*v); + let data = self.buffers.get_unchecked(view.buffer_index as usize); + let offset = view.offset as usize; + data.get_unchecked(offset..offset + len as usize) + }; + T::Native::from_bytes_unchecked(b) + } + + /// constructs a new iterator + pub fn iter(&self) -> ArrayIter<&Self> { + ArrayIter::new(self) + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + views: self.views.slice(offset, length), + buffers: self.buffers.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + phantom: Default::default(), + } + } +} + +impl std::fmt::Debug for GenericByteViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}ViewArray\n[\n", T::PREFIX)?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for GenericByteViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.views.len() + } + + fn is_empty(&self) -> bool { + self.views.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::(); + sum += self.views.inner().capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() + } +} + +impl<'a, T: ByteViewType> ArrayAccessor for &'a GenericByteViewArray { + type Item = &'a T::Native; + + fn value(&self, index: usize) -> Self::Item { + GenericByteViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericByteViewArray::value_unchecked(self, index) + } +} + +impl<'a, T: ByteViewType> IntoIterator for &'a GenericByteViewArray { + type Item = Option<&'a T::Native>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} + +impl From for GenericByteViewArray { + fn from(value: ArrayData) -> Self { + let views = value.buffers()[0].clone(); + let views = ScalarBuffer::new(views, value.offset(), value.len()); + let buffers = value.buffers()[1..].to_vec(); + Self { + data_type: T::DATA_TYPE, + views, + buffers, + nulls: value.nulls().cloned(), + phantom: Default::default(), + } + } +} + +impl From> for ArrayData { + fn from(mut array: GenericByteViewArray) -> Self { + let len = array.len(); + array.buffers.insert(0, array.views.into_inner()); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .buffers(array.buffers) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } + } +} + +impl FromIterator> for GenericByteViewArray +where + Ptr: AsRef, +{ + fn from_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = GenericByteViewBuilder::::with_capacity(iter.size_hint().0); + builder.extend(iter); + builder.finish() + } +} + +/// A [`GenericByteViewArray`] of `str` +/// +/// ``` +/// # use arrow_array::StringViewArray; +/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "foo", "large payload over 12 bytes"]); +/// assert_eq!(array.value(0), "hello"); +/// assert_eq!(array.value(3), "large payload over 12 bytes"); +/// ``` +pub type StringViewArray = GenericByteViewArray; + +/// A [`GenericByteViewArray`] of `[u8]` +pub type BinaryViewArray = GenericByteViewArray; diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 7aa3f92bfbd2..e2e857774ae2 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -34,6 +34,9 @@ pub use boolean_array::*; mod byte_array; pub use byte_array::*; +mod byte_view_array; +pub use byte_view_array::*; + mod dictionary_array; pub use dictionary_array::*; diff --git a/arrow-array/src/builder/byte_view_builder.rs b/arrow-array/src/builder/byte_view_builder.rs new file mode 100644 index 000000000000..65b44496d3f6 --- /dev/null +++ b/arrow-array/src/builder/byte_view_builder.rs @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::types::{BinaryViewType, ByteViewType, StringViewType}; +use crate::{ArrayRef, GenericByteViewArray}; +use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; +use arrow_data::view::View; +use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024; + +/// A builder for [`GenericByteViewArray`] +/// +/// See [`Self::append_value`] for the allocation strategy +pub struct GenericByteViewBuilder { + views_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + completed: Vec, + in_progress: Vec, + block_size: u32, + phantom: PhantomData, +} + +impl GenericByteViewBuilder { + /// Creates a new [`GenericByteViewBuilder`]. + pub fn new() -> Self { + Self::with_capacity(1024) + } + + /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` strings + pub fn with_capacity(capacity: usize) -> Self { + Self { + views_builder: BufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), + completed: vec![], + in_progress: vec![], + block_size: DEFAULT_BLOCK_SIZE, + phantom: Default::default(), + } + } + + /// Override the minimum size of buffers to allocate for string data + pub fn with_block_size(self, block_size: u32) -> Self { + Self { block_size, ..self } + } + + /// Appends a value into the builder + /// + /// # Panics + /// + /// Panics if + /// - String buffer count exceeds `u32::MAX` + /// - String length exceeds `u32::MAX` + #[inline] + pub fn append_value(&mut self, value: impl AsRef) { + let v: &[u8] = value.as_ref().as_ref(); + let length: u32 = v.len().try_into().unwrap(); + if length <= 12 { + let mut offset = [0; 16]; + offset[0..4].copy_from_slice(&length.to_le_bytes()); + offset[4..4 + v.len()].copy_from_slice(v); + self.views_builder.append(u128::from_le_bytes(offset)); + self.null_buffer_builder.append_non_null(); + return; + } + + let required_cap = self.in_progress.len() + v.len(); + if self.in_progress.capacity() < required_cap { + let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize)); + let flushed = std::mem::replace(&mut self.in_progress, in_progress); + if !flushed.is_empty() { + assert!(self.completed.len() < u32::MAX as usize); + self.completed.push(flushed.into()); + } + }; + let offset = self.in_progress.len() as u32; + self.in_progress.extend_from_slice(v); + + let view = View { + length, + prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), + buffer_index: self.completed.len() as u32, + offset, + }; + self.views_builder.append(view.into()); + self.null_buffer_builder.append_non_null(); + } + + /// Append an `Option` value into the builder + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append a null value into the builder + #[inline] + pub fn append_null(&mut self) { + self.null_buffer_builder.append_null(); + self.views_builder.append(0); + } + + /// Builds the [`GenericByteViewArray`] and reset this builder + pub fn finish(&mut self) -> GenericByteViewArray { + let mut completed = std::mem::take(&mut self.completed); + if !self.in_progress.is_empty() { + completed.push(std::mem::take(&mut self.in_progress).into()); + } + let len = self.views_builder.len(); + let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); + let nulls = self.null_buffer_builder.finish(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Builds the [`GenericByteViewArray`] without resetting the builder + pub fn finish_cloned(&self) -> GenericByteViewArray { + let mut completed = self.completed.clone(); + if !self.in_progress.is_empty() { + completed.push(Buffer::from_slice_ref(&self.in_progress)); + } + let len = self.views_builder.len(); + let views = Buffer::from_slice_ref(self.views_builder.as_slice()); + let views = ScalarBuffer::new(views, 0, len); + let nulls = self.null_buffer_builder.finish_cloned(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } +} + +impl Default for GenericByteViewBuilder { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for GenericByteViewBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}ViewBuilder", T::PREFIX)?; + f.debug_struct("") + .field("views_builder", &self.views_builder) + .field("in_progress", &self.in_progress) + .field("completed", &self.completed) + .field("null_buffer_builder", &self.null_buffer_builder) + .finish() + } +} + +impl ArrayBuilder for GenericByteViewBuilder { + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn into_box_any(self: Box) -> Box { + self + } +} + +impl> Extend> + for GenericByteViewBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + +/// Array builder for [`StringViewArray`][crate::StringViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. +pub type StringViewBuilder = GenericByteViewBuilder; + +/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] +/// +/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with +/// [`GenericByteViewBuilder::append_null`] as normal. +pub type BinaryViewBuilder = GenericByteViewBuilder; diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index d33e565a868b..6d13a76b63a5 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -154,6 +154,8 @@ mod boolean_builder; pub use boolean_builder::*; mod buffer_builder; pub use buffer_builder::*; +mod byte_view_builder; +pub use byte_view_builder::*; mod fixed_size_binary_builder; pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 83a229c1da0d..13d7145caa1d 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -25,6 +25,7 @@ use crate::timezone::Tz; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; +use arrow_data::view::{validate_binary_view, validate_string_view}; use arrow_schema::{ ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, @@ -1441,6 +1442,10 @@ pub(crate) mod bytes { std::str::from_utf8_unchecked(b) } } + + pub trait ByteViewTypeSealed {} + impl ByteViewTypeSealed for BinaryViewType {} + impl ByteViewTypeSealed for StringViewType {} } /// A trait over the variable-size byte array types @@ -1544,6 +1549,49 @@ pub type BinaryType = GenericBinaryType; /// An arrow binary array with i64 offsets pub type LargeBinaryType = GenericBinaryType; +/// A trait over the variable-size byte view array types +pub trait ByteViewType: 'static + Send + Sync + bytes::ByteViewTypeSealed { + /// Type for representing its equivalent rust type i.e + /// Utf8Array will have native type has &str + /// BinaryArray will have type as [u8] + type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; + + /// Datatype of array elements + const DATA_TYPE: DataType; + + /// "Binary" or "String", for use in error messages + const PREFIX: &'static str; + + /// Verifies that the provided buffers are valid for this array type + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError>; +} + +/// [`ByteViewType`] for string arrays +pub struct StringViewType {} + +impl ByteViewType for StringViewType { + type Native = str; + const DATA_TYPE: DataType = DataType::Utf8View; + const PREFIX: &'static str = "String"; + + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { + validate_string_view(views, buffers) + } +} + +/// [`ByteViewType`] for binary arrays +pub struct BinaryViewType {} + +impl ByteViewType for BinaryViewType { + type Native = [u8]; + const DATA_TYPE: DataType = DataType::BinaryView; + const PREFIX: &'static str = "Binary"; + + fn validate(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { + validate_binary_view(views, buffers) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 38074a8dc26c..5184d60ac1fd 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -149,6 +149,7 @@ native_integer!(u8); native_integer!(u16); native_integer!(u32); native_integer!(u64); +native_integer!(u128); macro_rules! native_float { ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => { diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index bd45c4f8ddda..3294389614d7 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -27,6 +27,7 @@ use std::ops::Range; use std::sync::Arc; use crate::equal; +use crate::view::{validate_binary_view, validate_string_view}; /// A collection of [`Buffer`] #[doc(hidden)] @@ -151,29 +152,6 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } -/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. -#[inline] -pub(crate) fn into_buffers( - data_type: &DataType, - buffer1: MutableBuffer, - buffer2: MutableBuffer, -) -> Vec { - match data_type { - DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], - DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { - vec![buffer1.into(), buffer2.into()] - } - DataType::Union(_, mode) => { - match mode { - // Based on Union's DataTypeLayout - UnionMode::Sparse => vec![buffer1.into()], - UnionMode::Dense => vec![buffer1.into(), buffer2.into()], - } - } - _ => vec![buffer1.into()], - } -} - /// A generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. @@ -737,7 +715,9 @@ impl ArrayData { ))); } - if self.buffers.len() != layout.buffers.len() { + if self.buffers.len() < layout.buffers.len() + || (!layout.variadic && self.buffers.len() != layout.buffers.len()) + { return Err(ArrowError::InvalidArgumentError(format!( "Expected {} buffers in array of type {:?}, got {}", layout.buffers.len(), @@ -1231,7 +1211,17 @@ impl ArrayData { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), - DataType::LargeBinary => self.validate_offsets_full::(self.buffers[1].len()), + DataType::LargeBinary => { + self.validate_offsets_full::(self.buffers[1].len()) + } + DataType::BinaryView => { + let views = self.typed_buffer::(0, self.len)?; + validate_binary_view(views, &self.buffers[1..]) + } + DataType::Utf8View => { + let views = self.typed_buffer::(0, self.len)?; + validate_string_view(views, &self.buffers[1..]) + } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; self.validate_offsets_full::(child.len) @@ -1494,6 +1484,9 @@ impl ArrayData { /// Return the expected [`DataTypeLayout`] Arrays of this data /// type are expected to have +/// +/// For types with a variadic number of buffers, such as [`DataType::Utf8View`], +/// this only returns the required buffers pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) @@ -1503,10 +1496,12 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::Null => DataTypeLayout { buffers: vec![], can_contain_null_mask: false, + variadic: false, }, DataType::Boolean => DataTypeLayout { buffers: vec![BufferSpec::BitMap], can_contain_null_mask: true, + variadic: false, }, DataType::Int8 => DataTypeLayout::new_fixed_width::(), DataType::Int16 => DataTypeLayout::new_fixed_width::(), @@ -1538,15 +1533,15 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataTypeLayout { buffers: vec![spec], can_contain_null_mask: true, + variadic: false, } } DataType::Binary => DataTypeLayout::new_binary::(), DataType::LargeBinary => DataTypeLayout::new_binary::(), + DataType::BinaryView => DataTypeLayout::new_view(), DataType::Utf8 => DataTypeLayout::new_binary::(), DataType::LargeUtf8 => DataTypeLayout::new_binary::(), - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), @@ -1575,6 +1570,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { } }, can_contain_null_mask: false, + variadic: false, } } DataType::Dictionary(key_type, _value_type) => layout(key_type), @@ -1590,6 +1586,11 @@ pub struct DataTypeLayout { /// Can contain a null bitmask pub can_contain_null_mask: bool, + + /// If the data type contains a variadic number of buffers + /// + /// This is false except for [`DataType::BinaryView`] and [`DataType::Utf8View`] + pub variadic: bool, } impl DataTypeLayout { @@ -1601,6 +1602,7 @@ impl DataTypeLayout { alignment: mem::align_of::(), }], can_contain_null_mask: true, + variadic: false, } } @@ -1611,6 +1613,7 @@ impl DataTypeLayout { Self { buffers: vec![], can_contain_null_mask: true, + variadic: false, } } @@ -1629,6 +1632,19 @@ impl DataTypeLayout { BufferSpec::VariableWidth, ], can_contain_null_mask: true, + variadic: false, + } + } + + /// Describes a view type + fn new_view() -> Self { + Self { + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], + can_contain_null_mask: true, + variadic: true, } } } @@ -1834,7 +1850,7 @@ impl From for ArrayDataBuilder { #[cfg(test)] mod tests { use super::*; - use arrow_schema::{Field, UnionFields}; + use arrow_schema::Field; // See arrow/tests/array_data_validation.rs for test of array validation @@ -2082,23 +2098,6 @@ mod tests { assert!(!contains_nulls(Some(&buffer), 0, 0)); } - #[test] - fn test_into_buffers() { - let data_types = vec![ - DataType::Union(UnionFields::empty(), UnionMode::Dense), - DataType::Union(UnionFields::empty(), UnionMode::Sparse), - ]; - - for data_type in data_types { - let buffers = new_buffers(&data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(&data_type, buffer1, buffer2); - - let layout = layout(&data_type); - assert_eq!(buffers.len(), layout.buffers.len()); - } - } - #[test] fn test_alignment() { let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); diff --git a/arrow-data/src/equal/binary_view.rs b/arrow-data/src/equal/binary_view.rs new file mode 100644 index 000000000000..0908def473ed --- /dev/null +++ b/arrow-data/src/equal/binary_view.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::view::View; +use crate::ArrayData; + +pub(super) fn binary_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_views = &lhs.buffer::(0)[lhs_start..lhs_start + len]; + let lhs_b = lhs.buffers(); + let rhs_views = &rhs.buffer::(0)[rhs_start..rhs_start + len]; + let rhs_b = rhs.buffers(); + + for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if lhs.is_null(idx) { + continue; + } + + let l_len = *l as u32; + let r_len = *r as u32; + if l_len != r_len { + return false; + } else if l_len <= 12 { + // Inline storage + if l != r { + return false; + } + } else { + let l_view = View::from(*l); + let r_view = View::from(*r); + let l_b = &lhs_b[(l_view.buffer_index as usize) + 1]; + let r_b = &rhs_b[(r_view.buffer_index as usize) + 1]; + + let l_o = l_view.offset as usize; + let r_o = r_view.offset as usize; + let len = l_len as usize; + if l_b[l_o..l_o + len] != r_b[r_o..r_o + len] { + return false; + } + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 1255ff39e097..ae017a605669 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -24,6 +24,7 @@ use arrow_buffer::i256; use arrow_schema::{DataType, IntervalUnit}; use half::f16; +mod binary_view; mod boolean; mod dictionary; mod fixed_binary; @@ -40,6 +41,7 @@ mod variable_size; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used // to build the generic functions below (`equal_range` and `equal`). +use crate::equal::binary_view::binary_view_equal; use boolean::boolean_equal; use dictionary::dictionary_equal; use fixed_binary::fixed_binary_equal; @@ -74,6 +76,7 @@ fn equal_values( DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Decimal128(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), @@ -95,9 +98,11 @@ fn equal_values( DataType::LargeUtf8 | DataType::LargeBinary => { variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) } - DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not yet implemented") + binary_view_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::FixedSizeBinary(_) => { + fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), @@ -115,7 +120,6 @@ fn equal_values( DataType::UInt64 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), _ => unreachable!(), }, - DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::RunEndEncoded(_, _) => run_equal(lhs, rhs, lhs_start, rhs_start, len), } diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index cfa0dba66c35..901dcfe95a07 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -30,3 +30,5 @@ pub mod decimal; #[cfg(feature = "ffi")] pub mod ffi; + +pub mod view; diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index ef53efac2373..ea0e15e76692 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -15,13 +15,11 @@ // specific language governing permissions and limitations // under the License. -use super::{ - data::{into_buffers, new_buffers}, - ArrayData, ArrayDataBuilder, -}; +use super::{data::new_buffers, ArrayData, ArrayDataBuilder}; use crate::bit_mask::set_bits; +use crate::view::View; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_buffer::{bit_util, i256, ArrowNativeType, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; use num::Integer; @@ -68,36 +66,6 @@ impl<'a> _MutableArrayData<'a> { .as_mut() .expect("MutableArrayData not nullable") } - - fn freeze(self, dictionary: Option) -> ArrayDataBuilder { - let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); - - let child_data = match self.data_type { - DataType::Dictionary(_, _) => vec![dictionary.unwrap()], - _ => { - let mut child_data = Vec::with_capacity(self.child_data.len()); - for child in self.child_data { - child_data.push(child.freeze()); - } - child_data - } - }; - - let nulls = self - .null_buffer - .map(|nulls| { - let bools = BooleanBuffer::new(nulls.into(), 0, self.len); - unsafe { NullBuffer::new_unchecked(bools, self.null_count) } - }) - .filter(|n| n.null_count() > 0); - - ArrayDataBuilder::new(self.data_type) - .offset(0) - .len(self.len) - .nulls(nulls) - .buffers(buffers) - .child_data(child_data) - } } fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { @@ -138,26 +106,31 @@ fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits pub struct MutableArrayData<'a> { #[allow(dead_code)] arrays: Vec<&'a ArrayData>, - // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to - // mutability invariants (interior mutability): - // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not - // [MutableArrayData] itself + /// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to + /// mutability invariants (interior mutability): + /// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not + /// [MutableArrayData] itself data: _MutableArrayData<'a>, - // the child data of the `Array` in Dictionary arrays. - // This is not stored in `MutableArrayData` because these values constant and only needed - // at the end, when freezing [_MutableArrayData]. + /// the child data of the `Array` in Dictionary arrays. + /// This is not stored in `MutableArrayData` because these values constant and only needed + /// at the end, when freezing [_MutableArrayData]. dictionary: Option, - // function used to extend values from arrays. This function's lifetime is bound to the array - // because it reads values from it. + /// View data buffers + /// This is not stored in `MutableArrayData` because these values constant and only needed + /// at the end, when freezing [_MutableArrayData]. + view_buffers: Vec, + + /// function used to extend values from arrays. This function's lifetime is bound to the array + /// because it reads values from it. extend_values: Vec>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. + /// function used to extend nulls from arrays. This function's lifetime is bound to the array + /// because it reads nulls from it. extend_null_bits: Vec>, - // function used to extend nulls. - // this is independent of the arrays and therefore has no lifetime. + /// function used to extend nulls. + /// this is independent of the arrays and therefore has no lifetime. extend_nulls: ExtendNulls, } @@ -197,6 +170,26 @@ fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Opti } } +/// Builds an extend that adds `buffer_offset` to any buffer indices encountered +fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend { + let views = array.buffer::(0); + Box::new( + move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { + mutable + .buffer1 + .extend(views[start..start + len].iter().map(|v| { + let len = *v as u32; + if len <= 12 { + return *v; // Stored inline + } + let mut view = View::from(*v); + view.buffer_index += buffer_offset; + view.into() + })) + }, + ) +} + fn build_extend(array: &ArrayData) -> Extend { match array.data_type() { DataType::Null => null::build_extend(array), @@ -269,9 +262,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Decimal256(_, _) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::Utf8View | DataType::BinaryView => primitive::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, DataType::LargeList(_) => list::extend_nulls::, DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { @@ -423,11 +414,10 @@ impl<'a> MutableArrayData<'a> { | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary + | DataType::Utf8View + | DataType::BinaryView | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { let children = arrays .iter() @@ -557,6 +547,15 @@ impl<'a> MutableArrayData<'a> { _ => (None, false), }; + let view_buffers = match &data_type { + DataType::BinaryView | DataType::Utf8View => arrays + .iter() + .flat_map(|x| x.buffers().iter().skip(1)) + .map(Buffer::clone) + .collect(), + _ => vec![], + }; + let extend_nulls = build_extend_nulls(data_type); let extend_null_bits = arrays @@ -589,6 +588,20 @@ impl<'a> MutableArrayData<'a> { extend_values.expect("MutableArrayData::new is infallible") } + DataType::Utf8View | DataType::BinaryView => { + let mut next_offset = 0_u32; + arrays + .iter() + .map(|array| { + let num_data_buffers = (array.buffers().len() - 1) as u32; + let offset = next_offset; + next_offset = next_offset + .checked_add(num_data_buffers) + .expect("buffer index overflow"); + build_extend_view(array, offset) + }) + .collect() + } _ => arrays.iter().map(|array| build_extend(array)).collect(), }; @@ -605,6 +618,7 @@ impl<'a> MutableArrayData<'a> { arrays, data, dictionary, + view_buffers, extend_values, extend_null_bits, extend_nulls, @@ -664,13 +678,56 @@ impl<'a> MutableArrayData<'a> { /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. pub fn freeze(self) -> ArrayData { - unsafe { self.data.freeze(self.dictionary).build_unchecked() } + unsafe { self.into_builder().build_unchecked() } } /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. /// This is useful for extending the default behavior of MutableArrayData. pub fn into_builder(self) -> ArrayDataBuilder { - self.data.freeze(self.dictionary) + let data = self.data; + + let buffers = match data.data_type { + DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => { + vec![] + } + DataType::BinaryView | DataType::Utf8View => { + let mut b = self.view_buffers; + b.insert(0, data.buffer1.into()); + b + } + DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary => vec![data.buffer1.into(), data.buffer2.into()], + DataType::Union(_, mode) => { + match mode { + // Based on Union's DataTypeLayout + UnionMode::Sparse => vec![data.buffer1.into()], + UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()], + } + } + _ => vec![data.buffer1.into()], + }; + + let child_data = match data.data_type { + DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()], + _ => data.child_data.into_iter().map(|x| x.freeze()).collect(), + }; + + let nulls = data + .null_buffer + .map(|nulls| { + let bools = BooleanBuffer::new(nulls.into(), 0, data.len); + unsafe { NullBuffer::new_unchecked(bools, data.null_count) } + }) + .filter(|n| n.null_count() > 0); + + ArrayDataBuilder::new(data.data_type) + .offset(0) + .len(data.len) + .nulls(nulls) + .buffers(buffers) + .child_data(child_data) } } diff --git a/arrow-data/src/view.rs b/arrow-data/src/view.rs new file mode 100644 index 000000000000..e49f41c5591c --- /dev/null +++ b/arrow-data/src/view.rs @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! View array utilities + +use arrow_buffer::Buffer; +use arrow_schema::ArrowError; + +/// The element layout of a view buffer +/// +/// See [`DataType::Utf8View`](arrow_schema::DataType) +pub struct View { + /// The length of the string + pub length: u32, + /// The first 4 bytes of string data + pub prefix: u32, + /// The buffer index + pub buffer_index: u32, + /// The offset into the buffer + pub offset: u32, +} + +impl From for View { + #[inline] + fn from(value: u128) -> Self { + Self { + length: value as u32, + prefix: (value >> 32) as u32, + buffer_index: (value >> 64) as u32, + offset: (value >> 96) as u32, + } + } +} + +impl From for u128 { + #[inline] + fn from(value: View) -> Self { + (value.length as u128) + | ((value.prefix as u128) << 32) + | ((value.buffer_index as u128) << 64) + | ((value.offset as u128) << 96) + } +} + +/// Validates the combination of `views` and `buffers` is a valid BinaryView +pub fn validate_binary_view( + views: &[u128], + buffers: &[Buffer], +) -> Result<(), ArrowError> { + validate_view_impl(views, buffers, |_, _| Ok(())) +} + +/// Validates the combination of `views` and `buffers` is a valid StringView +pub fn validate_string_view( + views: &[u128], + buffers: &[Buffer], +) -> Result<(), ArrowError> { + validate_view_impl(views, buffers, |idx, b| { + std::str::from_utf8(b).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Encountered non-UTF-8 data at index {idx}: {e}" + )) + })?; + Ok(()) + }) +} + +fn validate_view_impl( + views: &[u128], + buffers: &[Buffer], + f: F, +) -> Result<(), ArrowError> +where + F: Fn(usize, &[u8]) -> Result<(), ArrowError>, +{ + for (idx, v) in views.iter().enumerate() { + let len = *v as u32; + if len <= 12 { + if len < 12 && (v >> (32 + len * 8)) != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "View at index {idx} contained non-zero padding for string of length {len}", + ))); + } + f(idx, &v.to_le_bytes()[4..4 + len as usize])?; + } else { + let view = View::from(*v); + let data = buffers.get(view.buffer_index as usize).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid buffer index at {idx}: got index {} but only has {} buffers", + view.buffer_index, + buffers.len() + )) + })?; + + let start = view.offset as usize; + let end = start + len as usize; + let b = data.get(start..end).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid buffer slice at {idx}: got {start}..{end} but buffer {} has length {}", + view.buffer_index, + data.len() + )) + })?; + + if !b.starts_with(&view.prefix.to_le_bytes()) { + return Err(ArrowError::InvalidArgumentError( + "Mismatch between embedded prefix and data".to_string(), + )); + } + + f(idx, b)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate() { + let buffers = vec![ + Buffer::from(b"helloworldbongo\xFFfoo"), + Buffer::from(b"bar"), + ]; + + let test = |length: u32, prefix: u32, buffer_index: u32, offset: u32| { + let view = View { + length, + prefix, + buffer_index, + offset, + }; + validate_string_view(&[view.into()], &buffers) + }; + + let err = test(14, 0, 2, 0).unwrap_err().to_string(); + assert_eq!(err, "Invalid argument error: Invalid buffer index at 0: got index 2 but only has 2 buffers"); + + let err = test(14, 0, 1, 3).unwrap_err().to_string(); + assert_eq!(err, "Invalid argument error: Invalid buffer slice at 0: got 3..17 but buffer 1 has length 3"); + + let err = test(14, 0, 1, 3).unwrap_err().to_string(); + assert_eq!(err, "Invalid argument error: Invalid buffer slice at 0: got 3..17 but buffer 1 has length 3"); + + let err = test(15, 0, 0, 0).unwrap_err().to_string(); + assert_eq!( + err, + "Invalid argument error: Mismatch between embedded prefix and data" + ); + + let prefix = u32::from_le_bytes(*b"hell"); + test(15, prefix, 0, 0).unwrap(); + + let prefix = u32::from_le_bytes(*b"ello"); + let err = test(15, prefix, 0, 1).unwrap_err().to_string(); + assert_eq!( + err, + "Invalid argument error: Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 14" + ); + + let encoded = u128::from_le_bytes(*b"\x05\0\0\0hello\0\0\0\0\0\0\0"); + validate_string_view(&[encoded], &[]).unwrap(); + + let encoded = u128::from_le_bytes(*b"\x05\0\0\0hello\0\0\0s\0\0\0"); + let err = validate_string_view(&[encoded], &[]) + .unwrap_err() + .to_string(); + assert_eq!(err, "Invalid argument error: View at index 0 contained non-zero padding for string of length 5"); + + let encoded = u128::from_le_bytes(*b"\x05\0\0\0he\xFFlo\0\0\0\0\0\0\0"); + let err = validate_string_view(&[encoded], &[]) + .unwrap_err() + .to_string(); + assert_eq!(err, "Invalid argument error: Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 2"); + } +} diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index a04db1cf3538..f643547d12d4 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -343,7 +343,9 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Map(_, keys_sorted) => { json!({"name": "map", "keysSorted": keys_sorted}) } - DataType::RunEndEncoded(_, _) => todo!(), + DataType::RunEndEncoded(_, _) => { + unimplemented!() + } } } diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 5a267c876d6a..c53d36a5a64a 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -22,6 +22,7 @@ use arrow::array::{ UnionArray, }; use arrow::datatypes::Int16Type; +use arrow_array::StringViewArray; use arrow_buffer::Buffer; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; @@ -1027,6 +1028,48 @@ fn test_extend_nulls_panic() { mutable.extend_nulls(2); } +#[test] +fn test_string_view() { + let a1 = StringViewArray::from_iter_values(vec![ + "foo", + "very long string over 12 bytes", + "bar", + ]) + .into_data(); + let a2 = StringViewArray::from_iter(vec![ + Some("bar"), + None, + Some("long string also over 12 bytes"), + ]) + .into_data(); + + a1.validate_full().unwrap(); + a2.validate_full().unwrap(); + + let mut mutable = MutableArrayData::new(vec![&a1, &a2], false, 4); + mutable.extend(1, 0, 1); + mutable.extend(0, 1, 2); + mutable.extend(0, 0, 1); + mutable.extend(1, 2, 3); + + let array = StringViewArray::from(mutable.freeze()); + assert_eq!(array.data_buffers().len(), 2); + // Should have reused data buffers + assert_eq!(array.data_buffers()[0].as_ptr(), a1.buffers()[1].as_ptr()); + assert_eq!(array.data_buffers()[1].as_ptr(), a2.buffers()[1].as_ptr()); + + let v = array.iter().collect::>(); + assert_eq!( + v, + vec![ + Some("bar"), + Some("very long string over 12 bytes"), + Some("foo"), + Some("long string also over 12 bytes") + ] + ) +} + #[test] #[should_panic(expected = "Arrays with inconsistent types passed to MutableArrayData")] fn test_mixed_types() {