From ffbf53bbbab5e681d6529a80c7c4d7d691095ba8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Apr 2024 15:23:43 -0400 Subject: [PATCH] Encapsulate View manipulation --- arrow-array/src/array/byte_view_array.rs | 214 +++++-- .../src/builder/generic_bytes_view_builder.rs | 47 +- arrow-array/src/types.rs | 21 + arrow-data/src/byte_view.rs | 576 +++++++++++++++++- arrow-data/src/transform/mod.rs | 20 +- 5 files changed, 790 insertions(+), 88 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 79f2d47587a6..9c4b72348dc3 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{Array, ArrayAccessor, ArrayRef}; use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; -use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; +use arrow_data::{ArrayData, ArrayDataBuilder, OffsetView, View}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; -/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays. -/// -/// Different than [`crate::GenericByteArray`] as it stores both an offset and length -/// meaning that take / filter operations can be implemented without copying the underlying data. -/// -/// See [`StringViewArray`] for storing utf8 encoded string data and -/// [`BinaryViewArray`] for storing bytes. -/// -/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout +/// [Variable-size Binary View Layout]: An array of variable length byte strings. /// /// A `GenericByteViewArray` stores variable length byte strings. 
An array of -/// `N` elements is stored as `N` fixed length "views" and a variable number +/// `N` elements is stored as `N` fixed length [`View`]s and some number /// of variable length "buffers". /// -/// Each view is a `u128` value layout is different depending on the -/// length of the string stored at that location: +/// There are no constraints on offsets other than they must point into a valid +/// buffer. The offsets can be out of order, non-continuous and overlapping. /// -/// ```text -/// ┌──────┬────────────────────────┐ -/// │length│ string value │ -/// Strings (len <= 12) │ │ (padded with 0) │ -/// └──────┴────────────────────────┘ -/// 0 31 127 -/// -/// ┌───────┬───────┬───────┬───────┐ -/// │length │prefix │ buf │offset │ -/// Strings (len > 12) │ │ │ index │ │ -/// └───────┴───────┴───────┴───────┘ -/// 0 31 63 95 127 -/// ``` +/// Because `GenericByteViewArray` stores both an offset and length for each +/// byte string, certain operations such as `take` and `filter` can be +/// implemented without copying the underlying data, unlike +/// [`GenericByteArray`], which requires the variable length data to be +/// contiguous. /// -/// * Strings with length <= 12 are stored directly in the view. +/// # See Also: +/// * [`StringViewArray`] for storing UTF-8 string data +/// * [`BinaryViewArray`] for storing bytes +/// * [`View`] for the format of the views and interpreting the `u128` views /// -/// * Strings with length > 12: The first four bytes are stored inline in the -/// view and the entire string is stored in one of the buffers. +/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout /// -/// Unlike [`GenericByteArray`], there are no constraints on the offsets other -/// than they must point into a valid buffer. However, they can be out of order, -/// non continuous and overlapping. 
+/// # Example /// /// For example, in the following diagram, the strings "FishWasInTownToday" and /// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a @@ -93,6 +77,7 @@ use std::sync::Arc; /// └───┘ /// ``` /// [`GenericByteArray`]: crate::array::GenericByteArray +/// [`View`]: arrow_data::View pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, @@ -114,16 +99,26 @@ impl Clone for GenericByteViewArray { } impl GenericByteViewArray { - /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure + /// Create a new [`GenericByteViewArray`] from the provided parts, panicking + /// on failure. /// - /// # Panics + /// See [Self::try_new] for parameters /// + /// # Panics /// Panics if [`GenericByteViewArray::try_new`] returns an error + /// + /// [`View`]: arrow_data::View pub fn new(views: ScalarBuffer, buffers: Vec, nulls: Option) -> Self { Self::try_new(views, buffers, nulls).unwrap() } - /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure + /// Create a new [`GenericByteViewArray`] from the provided parts, returning + /// an error on failure + /// + /// # Parameters + /// * `views`: a [`ScalarBuffer`] of u128 views (see [`View`] for format) + /// * `buffers`: a vector of [`Buffer`]s storing the string data + /// * `nulls`: an optional [`NullBuffer`] for null values /// /// # Errors /// @@ -156,7 +151,10 @@ impl GenericByteViewArray { }) } - /// Create a new [`GenericByteViewArray`] from the provided parts, without validation + /// Create a new [`GenericByteViewArray`] from the provided parts, without + /// validation + /// + /// See [Self::try_new] for parameters /// /// # Safety /// @@ -233,20 +231,68 @@ impl GenericByteViewArray { } /// Returns the element at index `i` + /// /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { let v 
= self.views.get_unchecked(idx); - let len = *v as u32; - let b = if len <= 12 { - let ptr = self.views.as_ptr() as *const u8; - std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize) - } else { - let view = ByteView::from(*v); - let data = self.buffers.get_unchecked(view.buffer_index as usize); - let offset = view.offset as usize; - data.get_unchecked(offset..offset + len as usize) - }; + match View::from(v) { + View::Inline(inline_view) => { + let bytes = inline_view.get_bytes_unchecked(v); + T::Native::from_bytes_unchecked(bytes) + } + View::Offset(offset_view) => self.value_from_offset_view_unchecked(offset_view), + } + } + + /// Return the value of element from this [`OffsetView`] + /// + /// # Errors + /// * the buffer index is out of bounds + ///* offset / length is out of bounds of the buffer + /// * The data is not valid for `T::Native` (e.g. not Utf8) + pub fn value_from_offset_view<'a>( + &'a self, + offset_view: OffsetView<'_>, + ) -> Result<&'a T::Native, ArrowError> { + let data = self + .buffers + .get(offset_view.buffer_index() as usize) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid ByteView. Requested buffer {} but only has {} buffers", + offset_view.buffer_index(), + self.buffers.len() + )) + })?; + + let b = data.get(offset_view.range()).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Invalid ByteView. 
Requested range {:?} but buffer {} valid range is {:?}", + offset_view.range(), + offset_view.buffer_index(), + 0..data.len() + )) + })?; + + T::Native::try_from_bytes(b) + } + + /// Return the value from the [`OffsetView`] + /// + /// # Safety + /// The caller is responsible for ensuring: + /// * the buffer index is within of bounds + /// * offset / length is within of bounds of the buffer + /// * The data is valid for `T::Native` (e.g Utf8 for Strings) + pub unsafe fn value_from_offset_view_unchecked<'a>( + &'a self, + offset_view: OffsetView<'_>, + ) -> &'a T::Native { + let data = self + .buffers + .get_unchecked(offset_view.buffer_index() as usize); + let b = data.get_unchecked(offset_view.range()); T::Native::from_bytes_unchecked(b) } @@ -487,7 +533,7 @@ mod tests { use crate::builder::{BinaryViewBuilder, StringViewBuilder}; use crate::{Array, BinaryViewArray, StringViewArray}; use arrow_buffer::{Buffer, ScalarBuffer}; - use arrow_data::ByteView; + use arrow_data::{ByteView, OffsetView, View}; #[test] fn try_new_string() { @@ -533,6 +579,72 @@ mod tests { assert!(array.is_empty()); } + #[test] + fn test_value_from_offset_view() { + let array = test_array(); + let View::Offset(offset_view) = View::new(array.views().get(2).unwrap()) else { + panic!("Expected offset view"); + }; + assert_eq!( + array.value_from_offset_view(offset_view).unwrap(), + "large payload over 12 bytes" + ); + } + + #[test] + fn test_value_from_offset_view2() { + let array = test_array(); + // Get last 60 bytes from buffer (60 is in hex 0x3c) + // buffer is 65 + // offset 5, index 0, prefix=????, length 60 + let v = 0x00000005_00000000_00000000_0000003cu128; + + assert_eq!( + array.value_from_offset_view(OffsetView::from(&v)).unwrap(), + " payload over 12 bytessome other large payload over 12 bytes" + ); + } + + #[test] + #[should_panic(expected = "Invalid ByteView. 
Requested buffer 2 but only has 1 buffers")] + fn test_value_from_offset_view_invalid_buffer() { + let array = test_array(); + // offset 0, buffer = 2, prefix = ????, length = 256 + let v = 0x00000000_00000002_00000000_00000100u128; + array.value_from_offset_view(OffsetView::from(&v)).unwrap(); + } + + #[test] + #[should_panic( + expected = "Invalid ByteView. Requested range 256..271 but buffer 0 valid range is 0..65" + )] + fn test_value_from_offset_view_invalid_offset() { + let array = test_array(); + // offset 256, buffer = 0, prefix = ????, length = 15 + let v = 0x00000100_00000000_00000000_0000000fu128; + array.value_from_offset_view(OffsetView::from(&v)).unwrap(); + } + + #[test] + #[should_panic( + expected = "Invalid ByteView. Requested range 0..256 but buffer 0 valid range is 0..65" + )] + fn test_value_from_offset_view_invalid_too_long() { + let array = test_array(); + // offset 0, buffer = 0, prefix = ????, length = 256 + let v = 0x00000000_00000000_00000000_00000100u128; + array.value_from_offset_view(OffsetView::from(&v)).unwrap(); + } + + fn test_array() -> StringViewArray { + let mut builder = StringViewBuilder::new(); + builder.append_value("hello"); + builder.append_null(); + builder.append_option(Some("large payload over 12 bytes")); + builder.append_option(Some("some other large payload over 12 bytes")); + builder.finish() + } + #[test] fn test_append_string() { // test builder append @@ -620,8 +732,8 @@ mod tests { view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes()); view_buffer[4..].copy_from_slice(&data); - let view = ByteView::from(u128::from_le_bytes(view_buffer)); - let views = ScalarBuffer::from(vec![view.into()]); + let view = u128::from_le_bytes(view_buffer); + let views = ScalarBuffer::from(vec![view]); let buffers = vec![]; StringViewArray::new(views, buffers, None); } @@ -639,8 +751,8 @@ mod tests { view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]); view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes()); 
view_buffer[12..].copy_from_slice(&0u32.to_le_bytes()); - let view = ByteView::from(u128::from_le_bytes(view_buffer)); - let views = ScalarBuffer::from(vec![view.into()]); + let view = u128::from_le_bytes(view_buffer); + let views = ScalarBuffer::from(vec![view]); let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())]; StringViewArray::new(views, buffers, None); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 9accb932ae20..4e10d1d106a2 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -19,7 +19,7 @@ use crate::builder::ArrayBuilder; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{ArrayRef, GenericByteViewArray}; use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; -use arrow_data::ByteView; +use arrow_data::{OffsetViewBuilder, OwnedView}; use std::any::Any; use std::marker::PhantomData; @@ -72,35 +72,28 @@ impl GenericByteViewBuilder { #[inline] pub fn append_value(&mut self, value: impl AsRef) { let v: &[u8] = value.as_ref().as_ref(); - let length: u32 = v.len().try_into().unwrap(); - if length <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); - view_buffer[4..4 + v.len()].copy_from_slice(v); - self.views_builder.append(u128::from_le_bytes(view_buffer)); - self.null_buffer_builder.append_non_null(); - return; - } - let required_cap = self.in_progress.len() + v.len(); - if self.in_progress.capacity() < required_cap { - let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize)); - let flushed = std::mem::replace(&mut self.in_progress, in_progress); - if !flushed.is_empty() { - assert!(self.completed.len() < u32::MAX as usize); - self.completed.push(flushed.into()); + let view: u128 = match OwnedView::from(v) { + OwnedView::Inline(view) => view, + OwnedView::Offset(view) => { + 
let required_cap = self.in_progress.len() + v.len(); + if self.in_progress.capacity() < required_cap { + let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize)); + let flushed = std::mem::replace(&mut self.in_progress, in_progress); + if !flushed.is_empty() { + assert!(self.completed.len() < u32::MAX as usize); + self.completed.push(flushed.into()); + } + }; + let builder = OffsetViewBuilder::from(view) + .with_offset(self.in_progress.len() as u32) + .with_buffer_index(self.completed.len() as u32); + // copy the actual data into the in_progress buffer + self.in_progress.extend_from_slice(v); + builder.into() } }; - let offset = self.in_progress.len() as u32; - self.in_progress.extend_from_slice(v); - - let view = ByteView { - length, - prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), - buffer_index: self.completed.len() as u32, - offset, - }; - self.views_builder.append(view.into()); + self.views_builder.append(view); self.null_buffer_builder.append_non_null(); } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 038b2a291f58..64412f6c7c75 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1425,10 +1425,19 @@ pub(crate) mod bytes { impl ByteArrayTypeSealed for GenericBinaryType {} pub trait ByteArrayNativeType: std::fmt::Debug + Send + Sync { + /// Covert bytes to this native type + /// /// # Safety /// /// `b` must be a valid byte sequence for `Self` unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; + + /// Covert bytes to this native type + /// + /// # Errors + /// + /// `b` is not a valid byte sequence for `Self` (e.g. 
not UTF8) + fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError>; } impl ByteArrayNativeType for [u8] { @@ -1436,6 +1445,11 @@ pub(crate) mod bytes { unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { b } + + #[inline] + fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError> { + Ok(b) + } } impl ByteArrayNativeType for str { @@ -1443,6 +1457,13 @@ pub(crate) mod bytes { unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { std::str::from_utf8_unchecked(b) } + + #[inline] + fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError> { + std::str::from_utf8(b).map_err(|e| { + ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) + }) + } } } diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index b8b1731ac60b..69990a1da812 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -15,10 +15,464 @@ // specific language governing permissions and limitations // under the License. -use arrow_buffer::Buffer; +use arrow_buffer::{Buffer, ToByteSlice}; use arrow_schema::ArrowError; +use std::fmt::Formatter; +use std::ops::Range; -#[derive(Debug, Copy, Clone, Default)] +/// A `View` is a `u128` value that represents a single value in a +/// [`GenericByteViewArray`]. +/// +/// Depending on the array type, the value may be a utf8 string or simply bytes. +/// The layout of the u128 is different depending on the length of the bytes +/// stored at that location: +/// +/// # 12 or fewer bytes [`InlineView`] +/// +/// Values with 12 or fewer bytes are stored directly inlined in the `u128`. See +/// [`InlineView`] for field access. 
+/// +/// ```text +/// ┌───────────────────────────────────────────┬──────────────┐ +/// │ data │ length │ +/// Strings, len <= 12 │ (padded with \0) │ (u32) │ +/// (InlineView) │ │ │ +/// └───────────────────────────────────────────┴──────────────┘ +/// 127 31 0 bit +/// offset +/// ``` +/// +/// # More than 12 bytes [`OffsetView`] +/// +/// Values with more than 12 bytes store the first 4 bytes inline, an offset and +/// buffer index that reference the actual data (including the first 4 bytes) in +/// an externally managed buffer. See [`OffsetView`] for field access. +/// +/// ```text +/// ┌──────────────┬─────────────┬──────────────┬──────────────┐ +/// │buffer offset │ buffer index│ data prefix │ length │ +/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │ (u32) │ +/// (OffsetView) │ │ │ │ │ +/// └──────────────┴─────────────┴──────────────┴──────────────┘ +/// 127 95 63 31 0 bit +/// offset +/// ``` +/// +/// See Also: +/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views +/// +/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html +/// +/// # Notes +/// Equality is based on the bitwise value of the view, not the data it logically points to +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum View<'a> { + /// Entire string is inlined + Inline(InlineView<'a>), + /// String is stored in buffer, 4 byte prefix stored inline + Offset(OffsetView<'a>), +} + +impl<'a> View<'a> { + /// Create a new `View` representing the contents of a `u128` + #[inline(always)] + pub fn new(v: &'a u128) -> Self { + let len = *v as u32; + if len <= 12 { + Self::Inline(InlineView::from(v)) + } else { + Self::Offset(OffsetView::from(v)) + } + } +} + +impl<'a> From<&'a u128> for View<'a> { + #[inline(always)] + fn from(v: &'a u128) -> Self { + Self::new(v) + } +} + +/// Owned variant of [`View`] for constructing views from a string or byte slice. 
+/// +/// # Example +/// ``` +/// # use arrow_data::OwnedView; +/// // construct a view from a string +/// let view = OwnedView::new_from_str("hello"); +/// assert!(matches!(view, OwnedView::Inline(_))); +/// ``` +/// +/// ``` +/// # use arrow_data::OwnedView; +/// // construct a view from a longer string +/// let view = OwnedView::new_from_str("hello my name is crumple faced fish"); +/// assert!(matches!(view, OwnedView::Offset(_))); +/// ``` +/// +/// # Notes +/// Equality is based on the bitwise value of the view, not the data it logically points to +#[derive(PartialEq)] +pub enum OwnedView { + /// [`InlineView`]: Data is inlined (12 or fewer bytes) + Inline(u128), + /// [`OffsetView`]: Data is stored in a buffer (more than 12 bytes) + Offset(u128), +} + +impl OwnedView { + /// Create a new `OwnedView` from a preexisting u128 that represents a view. + /// + /// Note no validation is done on the u128 (e.g. no length checking) + pub fn new(v: u128) -> Self { + let len = v as u32; + if len <= 12 { + Self::Inline(v) + } else { + Self::Offset(v) + } + } + + /// Create a new view from a string + /// + /// See [`OwnedView::new_from_bytes`] for more details + pub fn new_from_str(value: &str) -> Self { + Self::new_from_bytes(value.as_bytes()) + } + + /// Construct an `OwnedView` from a byte slice + /// + /// This function constructs the appropriate view type to represent this + /// value, inlining the value or prefix as appropriate.
+ /// + /// # Notes: + /// * Does not manage any buffers / offsets (panics if `v.len()` exceeds `u32::MAX`) + /// * A created [`OwnedView::Offset`] has buffer index and offset set to zero + #[inline(always)] + pub fn new_from_bytes(v: &[u8]) -> Self { + let length: u32 = v.len().try_into().unwrap(); + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); + + if length <= 12 { + // copy all values + view_buffer[4..4 + v.len()].copy_from_slice(v); + Self::Inline(u128::from_le_bytes(view_buffer)) + } else { + // copy 4 byte prefix + view_buffer[4..8].copy_from_slice(&v[0..4]); + Self::Offset(u128::from_le_bytes(view_buffer)) + } + } + + /// Convert this `OwnedView` to a `View` that borrows the inner `u128` + pub fn as_view(&self) -> View<'_> { + match self { + Self::Inline(inline) => View::Inline(InlineView::from(inline)), + Self::Offset(offset) => View::Offset(OffsetView::from(offset)), + } + } +} + +impl std::fmt::Debug for OwnedView { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // format as zero padded hex, matching the Debug output of InlineView / OffsetView + match self { + Self::Inline(inline) => write!(f, "OwnedView::Inline({inline:#020x})"), + Self::Offset(offset) => write!(f, "OwnedView::Offset({offset:#020x})"), + } + } +} + +impl From<&str> for OwnedView { + fn from(value: &str) -> Self { + Self::new_from_str(value) + } +} + +impl From<&[u8]> for OwnedView { + fn from(value: &[u8]) -> Self { + Self::new_from_bytes(value) + } +} + +impl From<u128> for OwnedView { + fn from(value: u128) -> Self { + Self::new(value) + } +} + +/// A view for data where the variable length data is less than or equal to 12. +/// +/// See documentation on [`View`] for details.
+/// +/// # Notes +/// Note there is no validation done when converting to/from u128 +/// +/// Equality is based on the bitwise value of the view, not the data it +/// logically points to +#[derive(Copy, Clone, PartialEq)] +pub struct InlineView<'a>(&'a u128); + +impl<'a> std::fmt::Debug for InlineView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // format with hex bytes + write!(f, "InlineView({:#020x})", self.0) + } +} + +impl<'a> InlineView<'a> { + /// Create a new inline view from a u128 + #[inline(always)] + pub fn new_from_u128(v: &'a u128) -> Self { + Self(v) + } + + /// Return a reference to the u128 + pub fn as_u128(self) -> &'a u128 { + self.0 + } + + /// Convert this inline view to a u128 + pub fn into_u128(self) -> u128 { + *self.0 + } + + /// Return the length of the data, in bytes + #[inline(always)] + pub fn len(&self) -> usize { + // take first 4 bytes + let len = *self.0 as u32; + len as usize + } + + /// Return true if the length of the data is zero + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Access the value of the data, as bytes + /// + /// # Panics + /// If the length is greater than 12 (aka if this view is invalid) + #[inline(always)] + pub fn as_bytes(&self) -> &[u8] { + &self.0.to_byte_slice()[4..4 + self.len()] + } + + /// Access the value of the data, as bytes, unchecked + /// + /// # Safety + /// Undefined behavior if the length is greater than 12 + #[inline(always)] + pub unsafe fn as_bytes_unchecked(&self) -> &[u8] { + self.get_bytes_unchecked(self.0) + } + + /// Access the value of `v`, as bytes, unchecked described by this view + /// + /// This method can be used to access the inlined bytes described by this + /// view directly from a reference to the underlying `u128`. 
+ /// + /// # Safety + /// Undefined behavior if the length is greater than 12 + #[inline(always)] + pub unsafe fn get_bytes_unchecked<'b>(&self, v: &'b u128) -> &'b [u8] { + v.to_byte_slice().get_unchecked(4..4 + self.len()) + } +} + +impl<'a> From<&'a u128> for InlineView<'a> { + #[inline(always)] + fn from(v: &'a u128) -> Self { + Self::new_from_u128(v) + } +} + +impl<'a> From<InlineView<'a>> for &'a u128 { + #[inline(always)] + fn from(view: InlineView<'a>) -> Self { + view.as_u128() + } +} + +impl<'a> From<InlineView<'a>> for u128 { + #[inline(always)] + fn from(view: InlineView) -> Self { + view.into_u128() + } +} + +/// A view for data where the length of the variable length data is greater than +/// 12 bytes. +/// +/// See documentation on [`View`] for details. +/// +/// # Notes +/// There is no validation done when converting to/from u128 +/// +/// # See Also +/// * [`View`] to determine the correct view type for a given `u128` +/// * [`OffsetViewBuilder`] for modifying the buffer index and offset of an `OffsetView` +#[derive(Copy, Clone, PartialEq)] +pub struct OffsetView<'a>(&'a u128); + +impl<'a> std::fmt::Debug for OffsetView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // format with hex bytes + write!(f, "OffsetView({:#020x})", self.0) + } +} + +impl<'a> OffsetView<'a> { + /// Create a new offset view from a u128 + #[inline(always)] + pub fn new_from_u128(v: &'a u128) -> Self { + Self(v) + } + + /// Return a reference to the inner u128 + pub fn as_u128(self) -> &'a u128 { + self.0 + } + + /// Convert this offset view to a u128 + pub fn to_u128(&self) -> u128 { + *self.0 + } + + /// Return the length of the data, in bytes + #[inline(always)] + pub fn len(&self) -> usize { + // take first 4 bytes + let len = *self.0 as u32; + len as usize + } + + /// Return true if the view represents an empty string + /// + /// # Notes + /// + /// Since an `OffsetView` is always greater than 12 bytes, this function + /// always returns false.
+ #[inline(always)] + pub fn is_empty(&self) -> bool { + false + } + + /// Return the prefix of the data (always 4 bytes) + #[inline(always)] + pub fn prefix_as_bytes(&self) -> &[u8] { + &self.0.to_byte_slice()[4..8] + } + + /// Return the buffer index + #[inline(always)] + pub fn buffer_index(&self) -> u32 { + (((*self.0) & 0x00000000_ffffffff_00000000_00000000) >> 64) as u32 + } + + /// Return the offset into the buffer + #[inline(always)] + pub fn offset(&self) -> u32 { + (((*self.0) & 0xffffffff_00000000_00000000_00000000) >> 96) as u32 + } + + /// Return the range of the data in the offset buffer (the end saturates instead of overflowing `usize`) + #[inline(always)] + pub fn range(&self) -> Range<usize> { + let offset = self.offset() as usize; + offset..offset.saturating_add(self.len()) + } + + /// Return a builder for modifying this view + pub fn into_builder(&self) -> OffsetViewBuilder { + OffsetViewBuilder::new_from_u128(*self.0) + } +} + +impl<'a> From<&'a u128> for OffsetView<'a> { + #[inline(always)] + fn from(v: &'a u128) -> Self { + Self::new_from_u128(v) + } +} + +impl<'a> From<OffsetView<'a>> for &'a u128 { + #[inline(always)] + fn from(view: OffsetView<'a>) -> Self { + view.as_u128() + } +} + +impl<'a> From<OffsetView<'a>> for u128 { + #[inline(always)] + fn from(view: OffsetView) -> Self { + view.to_u128() + } +} + +/// Builder for [`OffsetView`]s +/// +/// This builder can help set offset and buffer index of an `OffsetView`. +/// +/// Note that the builder does not permit changing the length or prefix of the +/// view. To change the length or prefix, create a new `OffsetView` using +/// [`OwnedView::new_from_bytes`].
+#[derive(Clone, PartialEq)] +pub struct OffsetViewBuilder(u128); + +impl std::fmt::Debug for OffsetViewBuilder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "OffsetViewBuilder({:#020x})", self.0) + } +} + +impl OffsetViewBuilder { + fn new_from_u128(v: u128) -> Self { + Self(v) + } + + /// Retrieve the u128 as a OffsetView + pub fn as_offset_view(&self) -> OffsetView<'_> { + OffsetView::new_from_u128(&self.0) + } + + /// Set the buffer index (bits 64..96), replacing any buffer index already stored in the view + pub fn with_buffer_index(self, buffer_index: u32) -> Self { + Self((self.0 & !(0xffff_ffff_u128 << 64)) | ((buffer_index as u128) << 64)) + } + + /// Set the offset (bits 96..128), replacing any offset already stored in the view + pub fn with_offset(self, offset: u32) -> Self { + Self((self.0 & !(0xffff_ffff_u128 << 96)) | ((offset as u128) << 96)) + } + + /// Return the inner u128, consuming the builder + pub fn build(self) -> u128 { + self.0 + } +} + +impl From<u128> for OffsetViewBuilder { + fn from(v: u128) -> Self { + Self::new_from_u128(v) + } +} + +impl From<OffsetViewBuilder> for u128 { + fn from(builder: OffsetViewBuilder) -> Self { + builder.build() + } +} + +/// A view for data where the variable length data has more than 12 bytes. See +/// [`View`] for details. +/// +/// Note: equality for `ByteView` is based on the bitwise value of the view, not +/// the data it logically points to +#[derive(Copy, Clone, Default, PartialEq)] #[repr(C)] pub struct ByteView { /// The length of the string/bytes.
@@ -121,3 +575,121 @@ where } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn construction_empty() { + let s = ""; + let v = 0; + let owned = OwnedView::new_from_str(s); + assert_eq!(owned, OwnedView::Inline(v)); + assert_eq!(owned, OwnedView::from(v)); + assert_eq!(owned, OwnedView::from(s)); + } + + #[test] + fn construction_small() { + let s = "hello"; + // (7 bytes 0 padding, "hello", 5) + let v = 0x00000000_0000006f_6c6c6568_00000005u128; + let owned = OwnedView::new_from_str(s); + assert_eq!(owned, OwnedView::Inline(v)); + assert_eq!(owned, OwnedView::from(v)); + assert_eq!(owned, OwnedView::from(s)) + } + + #[test] + fn access_empty() { + let owned = OwnedView::new_from_str(""); + let View::Inline(inline) = owned.as_view() else { + panic!("unexpected view"); + }; + + assert_eq!(inline.len(), 0); + assert!(inline.is_empty()); + assert_eq!(inline.as_bytes(), []); + } + + #[test] + fn access_small() { + let owned = OwnedView::new_from_str("hello"); + let View::Inline(inline) = owned.as_view() else { + panic!("unexpected view"); + }; + assert_eq!(inline.len(), 5); + assert!(!inline.is_empty()); + assert_eq!(inline.as_bytes(), "hello".as_bytes()); + + // test accessing as a str (maybe make this unsafe or encapsulate in type system) + } + + #[test] + #[should_panic(expected = "range end index 19 out of range for slice of length 16")] + fn access_small_invalid() { + // use invalid length 15 (0x0f): more than the 12 bytes an inline view can hold, so slicing 4..19 panics + // (7 bytes 0 padding, "hello", 15) + let v = 0x00000000_0000006f_6c6c6568_0000000fu128; + let inline = InlineView(&v); + inline.as_bytes(); + } + + #[test] + fn construction_large() { + let s = "hello world here I am"; + // len = 21 (in hex is 0x15) + // prefix = "hell" (0x6c6c6568) + // offset/buffer_index = 0 + let v = 0x00000000_00000000_6c6c6568_00000015u128; + let owned = OwnedView::new_from_str(s); + let View::Offset(offset) = owned.as_view() else { + panic!("unexpected view"); + }; + assert_eq!(offset, OffsetView(&v));
assert_eq!(offset.prefix_as_bytes(), "hell".as_bytes()); + } + + #[test] + fn access_large() { + // len = 0xdeadbeef + // prefix = "frob" (0x66 0x72 0x6f 0x62) + // offset = 0x12345678 + // buffer_index = 0x87654321 + let v = 0x12345678_87654321_626f7266_deadbeefu128; + let offset = OffsetView(&v); + assert_eq!(offset.len(), 0xdeadbeef); + assert_eq!(offset.buffer_index(), 0x87654321); + assert_eq!(offset.offset(), 0x12345678); + assert_eq!(offset.prefix_as_bytes(), "frob".as_bytes()); + } + + #[test] + fn modification_large() { + // len = 34 (0x22) + let v = 0x00000000_00000000_87654321_00000022u128; + let builder = OffsetViewBuilder::new_from_u128(v); + + let offset = builder.as_offset_view(); + assert_eq!(offset.len(), 34); + assert_eq!(offset.buffer_index(), 0); + assert_eq!(offset.offset(), 0); + assert_eq!(offset.prefix_as_bytes(), [0x21, 0x43, 0x65, 0x87]); + + // modify the buffer index + let builder = builder.with_buffer_index(0x12345678); + let offset = builder.as_offset_view(); + assert_eq!(offset.buffer_index(), 0x12345678); + assert_eq!(offset.offset(), 0); + assert_eq!(offset.prefix_as_bytes(), [0x21, 0x43, 0x65, 0x87]); + assert_eq!(offset.to_u128(), 0x00000000_12345678_87654321_00000022u128); + + // modify the offset + let builder = builder.with_offset(0xfeedbeef); + let offset = builder.as_offset_view(); + assert_eq!(offset.buffer_index(), 0x12345678); + assert_eq!(offset.offset(), 0xfeedbeef); + assert_eq!(offset.prefix_as_bytes(), [0x21, 0x43, 0x65, 0x87]); + assert_eq!(offset.to_u128(), 0xfeedbeef_12345678_87654321_00000022u128); + } +} diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index b0d9475afcd6..bc0971de106f 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView}; +use super::{data::new_buffers, ArrayData, ArrayDataBuilder, View}; use crate::bit_mask::set_bits; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; @@ -178,13 +178,17 @@ fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend { mutable .buffer1 .extend(views[start..start + len].iter().map(|v| { - let len = *v as u32; - if len <= 12 { - return *v; // Stored inline - } - let mut view = ByteView::from(*v); - view.buffer_index += buffer_offset; - view.into() + let new_view: u128 = match View::from(v) { + View::Inline(inline_view) => inline_view.into(), + View::Offset(offset_view) => { + let buffer_index = offset_view.buffer_index() + buffer_offset; + offset_view + .into_builder() + .with_buffer_index(buffer_index) + .build() + } + }; + new_view })) }, )