From bbd85ed3d29601a293f5297736101e6e4460d548 Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:48:33 +0800 Subject: [PATCH] Add `ListView` & `LargeListView` basic construction and validation (#5664) * feat: list view basic construction and validation * fix: validate offset and sizes * chore: remove unused check * chore: add overflow checked * chore: lint --- arrow-data/src/data.rs | 70 ++++++++++++++++++++++++-- arrow/tests/array_validation.rs | 88 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 4 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 0ea74f789dd5..d092fd049d77 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -22,9 +22,9 @@ use crate::bit_iterator::BitSliceIterator; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, UnionMode}; -use std::mem; use std::ops::Range; use std::sync::Arc; +use std::{mem, usize}; use crate::{equal, validate_binary_view, validate_string_view}; @@ -929,6 +929,41 @@ impl ArrayData { Ok(()) } + /// Does a cheap sanity check that the `self.len` values in `buffer` are valid + /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long + fn validate_offsets_and_sizes( + &self, + values_length: usize, + ) -> Result<(), ArrowError> { + let offsets: &[T] = self.typed_buffer(0, self.len)?; + let sizes: &[T] = self.typed_buffer(1, self.len)?; + for i in 0..values_length { + let size = sizes[i].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting size[{}] ({}) to usize for {}", + i, sizes[i], self.data_type + )) + })?; + let offset = offsets[i].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting offset[{}] ({}) to usize for {}", + i, offsets[i], self.data_type + )) + })?; + if size + .checked_add(offset) + .expect("Offset and size have exceeded the usize boundary") + > values_length + { + return Err(ArrowError::InvalidArgumentError(format!( + "Size {} at index {} is larger than the remaining values for {}", + size, i, self.data_type + ))); + } + } + Ok(()) + } + /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { @@ -942,6 +977,16 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::ListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets_and_sizes::(values_data.len)?; + Ok(()) + } + DataType::LargeListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets_and_sizes::(values_data.len)?; + Ok(()) + } DataType::FixedSizeList(field, list_size) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -1546,9 +1591,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => DataTypeLayout::new_list_view::(), + DataType::LargeListView(_) => DataTypeLayout::new_list_view::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data, @@ -1661,6 +1705,24 @@ impl DataTypeLayout { variadic: true, } } + + /// Describes a list view type + pub fn new_list_view() -> Self { + Self { + buffers: vec![ + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + ], + can_contain_null_mask: true, + variadic: true, + } + } } /// Layout specification for a single data type buffer diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index f5298f82e0a4..41def9051d44 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() { .unwrap(); } +/// Test that the list of type `data_type` generates correct offset and size out of bounds errors +fn check_list_view_offsets_sizes( + data_type: DataType, + offsets: Vec, + sizes: Vec, +) { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let offsets_buffer = Buffer::from_slice_ref(offsets); + let sizes_buffer = Buffer::from_slice_ref(sizes); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, sizes_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for ListView")] +fn test_validate_list_view_offsets_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic( + expected = "Size 3 at index 3 is larger than the remaining values for LargeListView" +)] +fn test_validate_large_list_view_offsets_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting offset[1] (-1) to usize for ListView")] +fn test_validate_list_view_negative_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, -1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting size[2] (-1) to usize for ListView")] +fn test_validate_list_view_negative_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, -1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting offset[1] (-1) to usize for LargeListView")] +fn test_validate_large_list_view_negative_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, -1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting size[2] (-1) to usize for LargeListView")] +fn test_validate_large_list_view_negative_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, -1, 3], + ); +} + #[test] #[should_panic( expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"