Skip to content

Commit

Permalink
[WIP] feat: support list_view_array
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikkon committed Mar 24, 2024
1 parent f41c2a4 commit 92cbaba
Show file tree
Hide file tree
Showing 11 changed files with 999 additions and 15 deletions.
501 changes: 501 additions & 0 deletions arrow-array/src/array/list_view_array.rs

Large diffs are not rendered by default.

31 changes: 28 additions & 3 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
mod binary_array;

use crate::types::*;
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer, SizeBuffer};
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
Expand Down Expand Up @@ -65,13 +65,14 @@ mod union_array;
pub use union_array::*;

mod run_array;

pub use run_array::*;

mod byte_view_array;

pub use byte_view_array::*;

mod list_view_array;
pub use list_view_array::*;

/// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html)
pub trait Array: std::fmt::Debug + Send + Sync {
/// Returns the array as [`Any`] so that it can be
Expand Down Expand Up @@ -519,6 +520,12 @@ impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericListArray<OffsetSize> {
}
}

impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericListViewArray<OffsetSize> {
    /// Compares two list-view arrays through their `ArrayData` representations.
    fn eq(&self, other: &Self) -> bool {
        let lhs = self.to_data();
        let rhs = other.to_data();
        lhs == rhs
    }
}

impl PartialEq for MapArray {
fn eq(&self, other: &Self) -> bool {
self.to_data().eq(&other.to_data())
Expand Down Expand Up @@ -606,7 +613,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef {
DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef,
DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef,
DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef,
DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef,
DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef,
DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef,
DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef,
DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef,
DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef,
Expand Down Expand Up @@ -687,6 +696,22 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
}
}

/// Helper function that builds the [`SizeBuffer`] of an [`ArrayData`]
///
/// The sizes are read from `data.buffers()[1]`, windowed by `data.offset()`
/// and `data.len()`. When both the array and that buffer are empty, an empty
/// [`SizeBuffer`] is returned instead.
///
/// # Safety
///
/// NOTE(review): the original contract is undocumented. Presumably `data`
/// must be valid list-view data whose second buffer holds at least
/// `data.offset() + data.len()` sizes of type `O` — confirm against callers.
unsafe fn get_sizes<O: ArrowNativeType>(data: &ArrayData) -> SizeBuffer<O> {
    if data.is_empty() && data.buffers()[1].is_empty() {
        return SizeBuffer::new_empty();
    }

    let sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len());
    // Safety:
    // ArrayData is valid
    SizeBuffer::new(sizes)
}

/// Helper function for printing potentially long arrays.
fn print_long_array<A, F>(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result
where
Expand Down
249 changes: 249 additions & 0 deletions arrow-array/src/builder/generic_list_view_builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.


use std::any::Any;
use std::sync::Arc;
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer};
use arrow_schema::{Field, FieldRef};
use crate::builder::ArrayBuilder;
use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait};

/// Builder for [`GenericListViewArray`]
///
/// Child values are pushed into `values_builder`; each list slot is then
/// described by one entry in the offsets, sizes and validity builders.
#[derive(Debug)]
pub struct GenericListViewBuilder<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> {
//todo use OffsetBuffer?
// Start position of each slot within the child values
offsets_builder: BufferBuilder<OffsetSize>,
// Number of child values covered by each slot
sizes_builder: BufferBuilder<OffsetSize>,
// Validity of each slot; its length is what `ArrayBuilder::len` reports
null_buffer_builder: NullBufferBuilder,
// Builder of the child values shared by all slots
values_builder: T,
// Optional override of the child field; when `None`, `finish` creates a
// nullable field named `item`
field: Option<FieldRef>,
}




impl<O: OffsetSizeTrait, T: ArrayBuilder + Default> Default for GenericListViewBuilder<O, T> {
    /// Creates a builder around a default-constructed values builder.
    fn default() -> Self {
        let values_builder = T::default();
        Self::new(values_builder)
    }
}

impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> GenericListViewBuilder<OffsetSize, T> {
    /// Creates a new [`GenericListViewBuilder`] from a given values array builder
    ///
    /// The current length of `values_builder` is used as the initial capacity.
    pub fn new(values_builder: T) -> Self {
        let initial_capacity = values_builder.len();
        Self::with_capacity(values_builder, initial_capacity)
    }

    /// Creates a new [`GenericListViewBuilder`] from a given values array builder
    ///
    /// `capacity` is the number of list slots to pre-allocate space for in the
    /// offsets, sizes and validity builders.
    pub fn with_capacity(values_builder: T, capacity: usize) -> Self {
        Self {
            offsets_builder: BufferBuilder::new(capacity),
            sizes_builder: BufferBuilder::new(capacity),
            null_buffer_builder: NullBufferBuilder::new(capacity),
            values_builder,
            field: None,
        }
    }

    /// Override the field passed to [`GenericListViewArray::new`]
    ///
    /// By default a nullable field is created with the name `item`
    ///
    /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
    /// field's data type does not match that of `T`
    pub fn with_field(mut self, field: impl Into<FieldRef>) -> Self {
        self.field = Some(field.into());
        self
    }
}

/// Type-erased [`ArrayBuilder`] interface for [`GenericListViewBuilder`].
///
/// `finish` and `finish_cloned` wrap the identically named inherent methods
/// and return their result as an [`ArrayRef`].
impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> ArrayBuilder
for GenericListViewBuilder<OffsetSize, T>
where
T: 'static,
{
/// Returns the builder as a non-mutable `Any` reference.
fn as_any(&self) -> &dyn Any {
self
}

/// Returns the builder as a mutable `Any` reference.
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}

/// Returns the boxed builder as a box of `Any`.
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}

/// Returns the number of array slots in the builder
///
/// This is the length of the validity builder.
fn len(&self) -> usize {
self.null_buffer_builder.len()
}

/// Builds the array and reset this builder.
fn finish(&mut self) -> ArrayRef {
// Inherent methods take precedence over trait methods, so this calls the
// inherent `finish` returning `GenericListViewArray`; it does not recurse.
Arc::new(self.finish())
}

/// Builds the array without resetting the builder.
fn finish_cloned(&self) -> ArrayRef {
// Calls the inherent `finish_cloned`, for the same reason as `finish`.
Arc::new(self.finish_cloned())
}
}

impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> GenericListViewBuilder<OffsetSize, T>
where
    T: 'static,
{
    /// Returns the child array builder as a mutable reference.
    ///
    /// This mutable reference can be used to append values into the child array builder,
    /// but you must call [`append`](#method.append) to delimit each distinct list value.
    pub fn values(&mut self) -> &mut T {
        &mut self.values_builder
    }

    /// Returns the child array builder as an immutable reference
    pub fn values_ref(&self) -> &T {
        &self.values_builder
    }

    /// Finish the current variable-length list-view slot
    ///
    /// `size` is the number of values, counted back from the end of
    /// [`Self::values`], that belong to this slot. When `is_valid` is `false`
    /// the slot is recorded as an empty view and `size` is ignored.
    ///
    /// Every call pushes exactly one offset, one size and one validity bit, so
    /// the three buffers always have the same length.
    ///
    /// # Panics
    ///
    /// Panics if `size` exceeds the current length of [`Self::values`], or if
    /// the resulting offset or size does not fit in `OffsetSize`
    #[inline]
    pub fn append(&mut self, is_valid: bool, size: usize) {
        let values_len = self.values_builder.len();
        // Null slots still need an offset/size entry; use an empty view.
        let size = if is_valid { size } else { 0 };
        let offset = values_len
            .checked_sub(size)
            .expect("list view size exceeds the number of values in the child builder");

        self.offsets_builder
            .append(OffsetSize::from_usize(offset).unwrap());
        self.sizes_builder
            .append(OffsetSize::from_usize(size).unwrap());
        self.null_buffer_builder.append(is_valid);
    }

    /// Appends a non-null list made of the items of `i` to this
    /// [`GenericListViewBuilder`]
    ///
    /// The items are pushed into the child builder and the slot then covers
    /// exactly the values that were added.
    #[inline]
    pub fn append_value<I, V>(&mut self, i: I)
    where
        T: Extend<Option<V>>,
        I: IntoIterator<Item = Option<V>>,
    {
        let before = self.values_builder.len();
        self.values_builder.extend(i);
        // The slot size is however many values the child builder gained.
        let added = self.values_builder.len() - before;
        self.append(true, added);
    }

    /// Append a null to this [`GenericListViewBuilder`]
    ///
    /// The null slot is recorded as an empty view at the current end of the
    /// child values.
    #[inline]
    pub fn append_null(&mut self) {
        self.append(false, 0);
    }

    /// Appends an optional value into this [`GenericListViewBuilder`]
    ///
    /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`]
    #[inline]
    pub fn append_option<I, V>(&mut self, i: Option<I>)
    where
        T: Extend<Option<V>>,
        I: IntoIterator<Item = Option<V>>,
    {
        match i {
            Some(i) => self.append_value(i),
            None => self.append_null(),
        }
    }

    /// Builds the [`GenericListViewArray`] and reset this builder.
    ///
    /// After this call the offsets, sizes and validity builders are all empty,
    /// so the builder can be reused for a new array.
    pub fn finish(&mut self) -> GenericListViewArray<OffsetSize> {
        let values = self.values_builder.finish();
        let nulls = self.null_buffer_builder.finish();

        // Unlike a list array, a list-view array has one offset per slot and
        // no leading zero, so nothing is re-seeded after the builders reset.
        let offsets = self.offsets_builder.finish();
        // NOTE(review): list-view offsets are not necessarily monotonic and may
        // be empty; confirm this matches the invariants `OffsetBuffer` expects.
        let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };

        let sizes = self.sizes_builder.finish();
        let sizes = SizeBuffer::new(sizes.into());

        let field = match &self.field {
            Some(f) => f.clone(),
            None => Arc::new(Field::new("item", values.data_type().clone(), true)),
        };

        GenericListViewArray::new(field, offsets, sizes, values, nulls)
    }

    /// Builds the [`GenericListViewArray`] without resetting the builder.
    pub fn finish_cloned(&self) -> GenericListViewArray<OffsetSize> {
        let values = self.values_builder.finish_cloned();
        let nulls = self.null_buffer_builder.finish_cloned();

        let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice());
        // NOTE(review): list-view offsets are not necessarily monotonic and may
        // be empty; confirm this matches the invariants `OffsetBuffer` expects.
        let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };

        let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice());
        let sizes = SizeBuffer::new(sizes.into());

        let field = match &self.field {
            Some(f) => f.clone(),
            None => Arc::new(Field::new("item", values.data_type().clone(), true)),
        };

        GenericListViewArray::new(field, offsets, sizes, values, nulls)
    }

    /// Returns the current offsets buffer as a slice
    pub fn offsets_slice(&self) -> &[OffsetSize] {
        self.offsets_builder.as_slice()
    }
}

/// Extends the builder with one list slot per item of the iterator.
///
/// `Some(elements)` pushes `elements` into the child builder and records a
/// valid slot covering them; `None` records a null slot.
impl<O, B, V, E> Extend<Option<V>> for GenericListViewBuilder<O, B>
where
    O: OffsetSizeTrait,
    B: ArrayBuilder + Extend<E>,
    V: IntoIterator<Item = E>,
{
    #[inline]
    fn extend<T: IntoIterator<Item = Option<V>>>(&mut self, iter: T) {
        for v in iter {
            match v {
                Some(elements) => {
                    let before = self.values_builder.len();
                    self.values_builder.extend(elements);
                    // The slot covers exactly the values that were just added.
                    let added = self.values_builder.len() - before;
                    self.append(true, added);
                }
                None => self.append(false, 0),
            }
        }
    }
}
2 changes: 2 additions & 0 deletions arrow-array/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ pub use generic_byte_run_builder::*;
mod generic_bytes_view_builder;
pub use generic_bytes_view_builder::*;
mod union_builder;
mod generic_list_view_builder;
pub use generic_list_view_builder::*;

pub use union_builder::*;

Expand Down
4 changes: 3 additions & 1 deletion arrow-array/src/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
use crate::array::{
ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray,
GenericStringArray, PrimitiveArray,
GenericListViewArray, GenericStringArray, PrimitiveArray,
};
use crate::{FixedSizeListArray, MapArray};
use arrow_buffer::NullBuffer;
Expand Down Expand Up @@ -141,6 +141,8 @@ pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>;
pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>;
/// an iterator that returns Some(T) or None, that can be used on any ListArray
pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray<O>>;
/// an iterator that returns Some(T) or None, that can be used on any ListViewArray
pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray<O>>;
/// an iterator that returns Some(T) or None, that can be used on any MapArray
pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>;

Expand Down
4 changes: 1 addition & 3 deletions arrow-array/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -626,9 +626,7 @@ mod tests {
use std::collections::HashMap;

use super::*;
use crate::{
BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray,
};
use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StringViewArray};
use arrow_buffer::{Buffer, ToByteSlice};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::Fields;
Expand Down
3 changes: 3 additions & 0 deletions arrow-buffer/src/buffer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@ pub use boolean::*;
mod null;
pub use null::*;
mod run;
mod size;
pub use size::*;

pub use run::*;
Loading

0 comments on commit 92cbaba

Please sign in to comment.