Skip to content

Commit

Permalink
Add ListView & LargeListView basic construction and validation (#…
Browse files Browse the repository at this point in the history
…5664)

* feat: list view basic construction and validation

* fix: validate offset and sizes

* chore: remove unused check

* chore: add overflow checked

* chore: lint
  • Loading branch information
Kikkon authored Apr 30, 2024
1 parent a20d2e5 commit bbd85ed
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 4 deletions.
70 changes: 66 additions & 4 deletions arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ use crate::bit_iterator::BitSliceIterator;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
use arrow_schema::{ArrowError, DataType, UnionMode};
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::{mem, usize};

use crate::{equal, validate_binary_view, validate_string_view};

Expand Down Expand Up @@ -929,6 +929,41 @@ impl ArrayData {
Ok(())
}

/// Does a cheap sanity check that the `self.len` values in `buffer` are valid
/// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
&self,
values_length: usize,
) -> Result<(), ArrowError> {
let offsets: &[T] = self.typed_buffer(0, self.len)?;
let sizes: &[T] = self.typed_buffer(1, self.len)?;
for i in 0..values_length {
let size = sizes[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting size[{}] ({}) to usize for {}",
i, sizes[i], self.data_type
))
})?;
let offset = offsets[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting offset[{}] ({}) to usize for {}",
i, offsets[i], self.data_type
))
})?;
if size
.checked_add(offset)
.expect("Offset and size have exceeded the usize boundary")
> values_length
{
return Err(ArrowError::InvalidArgumentError(format!(
"Size {} at index {} is larger than the remaining values for {}",
size, i, self.data_type
)));
}
}
Ok(())
}

/// Validates the layout of `child_data` ArrayData structures
fn validate_child_data(&self) -> Result<(), ArrowError> {
match &self.data_type {
Expand All @@ -942,6 +977,16 @@ impl ArrayData {
self.validate_offsets::<i64>(values_data.len)?;
Ok(())
}
DataType::ListView(field) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;
self.validate_offsets_and_sizes::<i32>(values_data.len)?;
Ok(())
}
DataType::LargeListView(field) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;
self.validate_offsets_and_sizes::<i64>(values_data.len)?;
Ok(())
}
DataType::FixedSizeList(field, list_size) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;

Expand Down Expand Up @@ -1546,9 +1591,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
Expand Down Expand Up @@ -1661,6 +1705,24 @@ impl DataTypeLayout {
variadic: true,
}
}

/// Describes a list view type
pub fn new_list_view<T>() -> Self {
Self {
buffers: vec![
BufferSpec::FixedWidth {
byte_width: mem::size_of::<T>(),
alignment: mem::align_of::<T>(),
},
BufferSpec::FixedWidth {
byte_width: mem::size_of::<T>(),
alignment: mem::align_of::<T>(),
},
],
can_contain_null_mask: true,
variadic: true,
}
}
}

/// Layout specification for a single data type buffer
Expand Down
88 changes: 88 additions & 0 deletions arrow/tests/array_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() {
.unwrap();
}

/// Test that the list of type `data_type` generates correct offset and size out of bounds errors
fn check_list_view_offsets_sizes<T: ArrowNativeType>(
data_type: DataType,
offsets: Vec<T>,
sizes: Vec<T>,
) {
let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
let offsets_buffer = Buffer::from_slice_ref(offsets);
let sizes_buffer = Buffer::from_slice_ref(sizes);
ArrayData::try_new(
data_type,
4,
None,
0,
vec![offsets_buffer, sizes_buffer],
vec![values.into_data()],
)
.unwrap();
}

#[test]
#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for ListView")]
fn test_validate_list_view_offsets_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(
DataType::ListView(Arc::new(field_type)),
vec![0, 1, 1, 2],
vec![1, 1, 1, 3],
);
}

#[test]
#[should_panic(
expected = "Size 3 at index 3 is larger than the remaining values for LargeListView"
)]
fn test_validate_large_list_view_offsets_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(
DataType::LargeListView(Arc::new(field_type)),
vec![0, 1, 1, 2],
vec![1, 1, 1, 3],
);
}

#[test]
#[should_panic(expected = "Error converting offset[1] (-1) to usize for ListView")]
fn test_validate_list_view_negative_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(
DataType::ListView(Arc::new(field_type)),
vec![0, -1, 1, 2],
vec![1, 1, 1, 3],
);
}

#[test]
#[should_panic(expected = "Error converting size[2] (-1) to usize for ListView")]
fn test_validate_list_view_negative_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(
DataType::ListView(Arc::new(field_type)),
vec![0, 1, 1, 2],
vec![1, 1, -1, 3],
);
}

#[test]
#[should_panic(expected = "Error converting offset[1] (-1) to usize for LargeListView")]
fn test_validate_large_list_view_negative_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(
DataType::LargeListView(Arc::new(field_type)),
vec![0, -1, 1, 2],
vec![1, 1, 1, 3],
);
}

#[test]
#[should_panic(expected = "Error converting size[2] (-1) to usize for LargeListView")]
fn test_validate_large_list_view_negative_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(
DataType::LargeListView(Arc::new(field_type)),
vec![0, 1, 1, 2],
vec![1, 1, -1, 3],
);
}

#[test]
#[should_panic(
expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
Expand Down

0 comments on commit bbd85ed

Please sign in to comment.