From 98f642c9789757d30e611b1255313ec6dea01f98 Mon Sep 17 00:00:00 2001 From: broccoliSpicy <93440049+broccoliSpicy@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:17:53 -0500 Subject: [PATCH] feat: enable write variable width data in 2.1 (#3090) This PR adds a `Compute_Stat` trait to DataBlock, to separate the statistics computation and statistics inquiry. This PR also modifies `do_flush` and `encode_miniblock` in `PrimitiveStructuralEncoder` to enable write variable width data in 2.1(PrimitiveStructuralEncoder) --- rust/lance-encoding/src/data.rs | 11 +- .../src/encodings/logical/primitive.rs | 92 ++++-- rust/lance-encoding/src/statistics.rs | 269 +++++++++++------- 3 files changed, 234 insertions(+), 138 deletions(-) diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index 56f30f2bca..4e2a063801 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -29,7 +29,10 @@ use snafu::{location, Location}; use lance_core::{Error, Result}; -use crate::{buffer::LanceBuffer, statistics::Stat}; +use crate::{ + buffer::LanceBuffer, + statistics::{ComputeStat, Stat}, +}; /// `Encoding` enum serves as a encoding registration center. /// @@ -1232,7 +1235,7 @@ impl DataBlock { return Self::AllNull(AllNullDataBlock { num_values }); } - let encoded = match data_type { + let mut encoded = match data_type { DataType::Binary | DataType::Utf8 => arrow_binary_to_data_block(arrays, num_values, 32), DataType::BinaryView | DataType::Utf8View => { todo!() @@ -1318,6 +1321,10 @@ impl DataBlock { ) } }; + + // compute statistics + encoded.compute_stat(); + if !matches!(data_type, DataType::Dictionary(_, _)) { match nulls { Nullability::None => encoded, diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index a7c80bcbb7..b866cfe2a5 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -4,19 +4,21 @@ use std::{collections::VecDeque, fmt::Debug, iter, ops::Range, sync::Arc, vec}; use arrow::array::AsArray; -use arrow_array::{make_array, Array, ArrayRef}; +use arrow_array::{make_array, types::UInt64Type, Array, ArrayRef, PrimitiveArray}; use arrow_buffer::{bit_util, BooleanBuffer, NullBuffer}; use arrow_schema::{DataType, Field as ArrowField}; use futures::{future::BoxFuture, stream::FuturesUnordered, FutureExt, TryStreamExt}; -use lance_arrow::{deepcopy::deep_copy_array, DataTypeExt}; +use lance_arrow::deepcopy::deep_copy_array; use log::{debug, trace}; use snafu::{location, Location}; +use crate::data::DataBlock; +use crate::statistics::{GetStat, Stat}; use lance_core::{datatypes::Field, utils::tokio::spawn_cpu, Result}; use crate::{ buffer::LanceBuffer, - data::{BlockInfo, DataBlock, DataBlockBuilder, FixedWidthDataBlock, UsedEncoding}, + data::{BlockInfo, DataBlockBuilder, FixedWidthDataBlock, UsedEncoding}, decoder::{ BlockDecompressor, ColumnInfo, DecodeArrayTask, DecodePageTask, DecodedArray, DecodedPage, DecompressorStrategy, FieldScheduler, FilterExpression, LoadedPage, LogicalPageDecoder, @@ -1534,9 +1536,17 @@ impl PrimitiveStructuralEncoder { // As data gets wide then the # of values per block shrinks (very wide) // data doesn't even fit in a mini-block and the block overhead gets // too large and we prefer zipped. - fn is_narrow(arrays: &[ArrayRef], data_type: &DataType) -> bool { - let avg_bytes_per_row = Self::get_avg_value_size(arrays, data_type); - avg_bytes_per_row < 128 + fn is_narrow(data_block: &DataBlock) -> bool { + if let Some(max_len_array) = data_block.get_stat(Stat::MaxLength) { + let max_len_array = max_len_array + .as_any() + .downcast_ref::>() + .unwrap(); + if max_len_array.value(0) < 128 { + return true; + } + } + false } // Converts value data, repetition levels, and definition levels into a single @@ -1695,9 +1705,8 @@ impl PrimitiveStructuralEncoder { column_idx: u32, field: &Field, compression_strategy: &dyn CompressionStrategy, - arrays: Vec, + data: DataBlock, repdefs: Vec, - num_values: u64, row_number: u64, ) -> Result { let repdef = RepDefBuilder::serialize(repdefs); @@ -1707,7 +1716,6 @@ impl PrimitiveStructuralEncoder { // and potentially more decoder asymmetry. However, it may be worth // investigating at some point - let data = DataBlock::from_arrays(&arrays, num_values); let num_values = data.num_values(); // The validity is encoded in repdef so we can remove it let data = data.remove_validity(); @@ -1743,17 +1751,6 @@ impl PrimitiveStructuralEncoder { }) } - fn get_avg_value_size(_arrays: &[ArrayRef], data_type: &DataType) -> u64 { - // Simple types, we can infer avg size without looking at value - let byte_width = data_type.byte_width_opt(); - if let Some(byte_width) = byte_width { - return byte_width as u64; - } - - // Other types, we need to inspect buffers - todo!() - } - // Creates an encode task, consuming all buffered data fn do_flush( &mut self, @@ -1773,18 +1770,20 @@ impl PrimitiveStructuralEncoder { if num_values == num_nulls { Self::encode_all_null(column_idx, num_values, row_number) - } else if Self::is_narrow(&arrays, &field.data_type()) { - Self::encode_miniblock( - column_idx, - &field, - compression_strategy.as_ref(), - arrays, - repdefs, - num_values, - row_number, - ) } else { - todo!("Full zipped encoding") + let data_block = DataBlock::from_arrays(&arrays, num_values); + if Self::is_narrow(&data_block) { + Self::encode_miniblock( + column_idx, + &field, + compression_strategy.as_ref(), + data_block, + repdefs, + row_number, + ) + } else { + todo!("Full zipped encoding") + } } }) .boxed(); @@ -1857,3 +1856,34 @@ impl FieldEncoder for PrimitiveStructuralEncoder { std::future::ready(Ok(vec![EncodedColumn::default()])).boxed() } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{ArrayRef, Int8Array, StringArray}; + + use crate::encodings::logical::primitive::PrimitiveStructuralEncoder; + + use super::DataBlock; + + #[test] + fn test_is_narrow() { + let int8_array = Int8Array::from(vec![1, 2, 3]); + let array_ref: ArrayRef = Arc::new(int8_array); + let block = DataBlock::from_array(array_ref); + + assert!(PrimitiveStructuralEncoder::is_narrow(&block)); + + let string_array = StringArray::from(vec![Some("hello"), Some("world")]); + let block = DataBlock::from_array(string_array); + assert!(PrimitiveStructuralEncoder::is_narrow(&block)); + + let string_array = StringArray::from(vec![ + Some("hello world".repeat(100)), + Some("world".to_string()), + ]); + let block = DataBlock::from_array(string_array); + assert!((!PrimitiveStructuralEncoder::is_narrow(&block))); + } +} diff --git a/rust/lance-encoding/src/statistics.rs b/rust/lance-encoding/src/statistics.rs index b5518017be..d437ee653c 100644 --- a/rust/lance-encoding/src/statistics.rs +++ b/rust/lance-encoding/src/statistics.rs @@ -44,12 +44,83 @@ impl fmt::Display for Stat { write!(f, "{:?}", self) } } + +pub trait ComputeStat { + fn compute_stat(&mut self); +} + +impl ComputeStat for DataBlock { + fn compute_stat(&mut self) { + match self { + Self::Empty() => {} + Self::Constant(_) => {} + Self::AllNull(_) => {} + Self::Nullable(data_block) => data_block.data.compute_stat(), + Self::FixedWidth(data_block) => data_block.compute_stat(), + Self::FixedSizeList(_) => {} + Self::VariableWidth(data_block) => data_block.compute_stat(), + Self::Opaque(data_block) => data_block.compute_stat(), + Self::Struct(_) => {} + Self::Dictionary(_) => {} + } + } +} + +impl ComputeStat for VariableWidthBlock { + fn compute_stat(&mut self) { + if !self.block_info.0.read().unwrap().is_empty() { + panic!("compute_stat should only be called once during DataBlock construction"); + } + let data_size = self.data_size(); + let data_size_array = Arc::new(UInt64Array::from(vec![data_size])); + + let cardinality_array = self.cardinality(); + + let max_length_array = self.max_length(); + + let mut info = self.block_info.0.write().unwrap(); + info.insert(Stat::DataSize, data_size_array); + info.insert(Stat::Cardinality, cardinality_array); + info.insert(Stat::MaxLength, max_length_array); + } +} + +impl ComputeStat for FixedWidthDataBlock { + fn compute_stat(&mut self) { + // compute this datablock's data_size + let data_size = self.data_size(); + let data_size_array = Arc::new(UInt64Array::from(vec![data_size])); + + // compute this datablock's max_bit_width + let max_bit_widths = self.max_bit_widths(); + + // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8 + let max_len = self.bits_per_value / 8; + let max_len_array = Arc::new(UInt64Array::from(vec![max_len])); + + let mut info = self.block_info.0.write().unwrap(); + info.insert(Stat::DataSize, data_size_array); + info.insert(Stat::BitWidth, max_bit_widths); + info.insert(Stat::MaxLength, max_len_array); + } +} + +impl ComputeStat for OpaqueBlock { + fn compute_stat(&mut self) { + // compute this datablock's data_size + let data_size = self.data_size(); + let data_size_array = Arc::new(UInt64Array::from(vec![data_size])); + let mut info = self.block_info.0.write().unwrap(); + info.insert(Stat::DataSize, data_size_array); + } +} + pub trait GetStat { - fn get_stat(&mut self, stat: Stat) -> Option>; + fn get_stat(&self, stat: Stat) -> Option>; } impl GetStat for DataBlock { - fn get_stat(&mut self, stat: Stat) -> Option> { + fn get_stat(&self, stat: Stat) -> Option> { match self { Self::Empty() => None, Self::Constant(_) => None, @@ -57,12 +128,9 @@ impl GetStat for DataBlock { // the statistics is not calculated here as this enum is going to deprecated soon anyway None } - Self::Nullable(_) => { - // the statistics is not calculated here as this enum is going to deprecated soon anyway - None - } + Self::Nullable(data_block) => data_block.data.get_stat(stat), Self::FixedWidth(data_block) => data_block.get_stat(stat), - Self::FixedSizeList(_) => todo!("get_stat for FixedSizeList is not implemented yet."), + Self::FixedSizeList(_) => None, Self::VariableWidth(data_block) => data_block.get_stat(stat), Self::Opaque(data_block) => data_block.get_stat(stat), Self::Struct(data_block) => data_block.get_stat(stat), @@ -72,13 +140,13 @@ impl GetStat for DataBlock { } impl GetStat for VariableWidthBlock { - fn get_stat(&mut self, stat: Stat) -> Option> { + fn get_stat(&self, stat: Stat) -> Option> { match stat { Stat::BitWidth => None, Stat::NullCount => None, _ => { if self.block_info.0.read().unwrap().is_empty() { - self.compute_statistics(); + panic!("get_stat should be called after statistics are computed"); } self.block_info.0.read().unwrap().get(&stat).cloned() } @@ -87,20 +155,6 @@ impl GetStat for VariableWidthBlock { } impl VariableWidthBlock { - fn compute_statistics(&mut self) { - let data_size = self.data_size(); - let data_size_array = Arc::new(UInt64Array::from(vec![data_size])); - - let cardinality_array = self.cardinality(); - - let max_length_array = self.max_length(); - - let mut info = self.block_info.0.write().unwrap(); - info.insert(Stat::DataSize, data_size_array); - info.insert(Stat::Cardinality, cardinality_array); - info.insert(Stat::MaxLength, max_length_array); - } - // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data // without any adjustment(for example, no null_adjustment for offsets) fn cardinality(&mut self) -> Arc { @@ -173,13 +227,12 @@ impl VariableWidthBlock { } impl GetStat for FixedWidthDataBlock { - fn get_stat(&mut self, stat: Stat) -> Option> { + fn get_stat(&self, stat: Stat) -> Option> { match stat { Stat::NullCount => None, _ => { - // initialize statistics if self.block_info.0.read().unwrap().is_empty() { - self.compute_statistics(); + panic!("get_stat should be called after statistics are computed"); } self.block_info.0.read().unwrap().get(&stat).cloned() } @@ -188,19 +241,6 @@ impl GetStat for FixedWidthDataBlock { } impl FixedWidthDataBlock { - fn compute_statistics(&mut self) { - // compute this datablock's data_size - let data_size = self.data_size(); - let data_size_array = Arc::new(UInt64Array::from(vec![data_size])); - - // compute this datablock's max_bit_width - let max_bit_widths = self.max_bit_widths(); - - let mut info = self.block_info.0.write().unwrap(); - info.insert(Stat::DataSize, data_size_array); - info.insert(Stat::BitWidth, max_bit_widths); - } - fn max_bit_widths(&mut self) -> Arc { assert!(self.num_values > 0); @@ -255,7 +295,7 @@ impl FixedWidthDataBlock { } impl GetStat for OpaqueBlock { - fn get_stat(&mut self, stat: Stat) -> Option> { + fn get_stat(&self, stat: Stat) -> Option> { match stat { Stat::DataSize => self.block_info.0.read().unwrap().get(&stat).cloned(), _ => None, @@ -264,13 +304,13 @@ impl GetStat for OpaqueBlock { } impl GetStat for DictionaryDataBlock { - fn get_stat(&mut self, _stat: Stat) -> Option> { + fn get_stat(&self, _stat: Stat) -> Option> { None } } impl GetStat for StructDataBlock { - fn get_stat(&mut self, _stat: Stat) -> Option> { + fn get_stat(&self, _stat: Stat) -> Option> { None } } @@ -301,7 +341,7 @@ mod tests { let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9); + let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9); let concatenated_array = concat(&[ &*Arc::new(arr1.clone()) as &dyn Array, @@ -334,7 +374,7 @@ mod tests { // test DataType::Binary let mut gen = lance_datagen::array::rand_type(&DataType::Binary); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone()); let data_size_array = block.get_stat(Stat::DataSize).unwrap_or_else(|| { panic!( "A data block of type: {} should have valid {} statistics", @@ -370,7 +410,7 @@ mod tests { let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields)); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone()); assert!( block.get_stat(Stat::DataSize).is_none(), "Expected Stat::DataSize to be None for data block of type: {}", @@ -383,7 +423,7 @@ mod tests { Box::new(DataType::Utf8), )); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone()); assert!( block.get_stat(Stat::DataSize).is_none(), "Expected Stat::DataSize to be None for data block of type: {}", @@ -392,19 +432,33 @@ mod tests { let mut gen = array::rand::().with_nulls(&[false, true, false]); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); - assert!( - block.get_stat(Stat::DataSize).is_none(), - "Expected Stat::DataSize to be None for data block of type: {}", - block.name() - ); + let block = DataBlock::from_array(arr.clone()); + let data_size_array = block.get_stat(Stat::DataSize).unwrap_or_else(|| { + panic!( + "A data block of type: {} should have valid {} statistics", + block.name(), + Stat::DataSize + ) + }); + let data_size = data_size_array + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + let total_buffer_size: usize = arr + .to_data() + .buffers() + .iter() + .map(|buffer| buffer.len()) + .sum(); + assert!(data_size == total_buffer_size as u64); } #[test] fn test_bit_width_stat_for_integers() { let int8_array = Int8Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -419,7 +473,7 @@ mod tests { let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -434,7 +488,7 @@ mod tests { let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -449,7 +503,7 @@ mod tests { let int8_array = Int8Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -464,7 +518,7 @@ mod tests { let int16_array = Int16Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -479,7 +533,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -494,7 +548,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -509,7 +563,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -523,7 +577,7 @@ mod tests { ); let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -538,7 +592,7 @@ mod tests { let int16_array = Int16Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -553,7 +607,7 @@ mod tests { let int32_array = Int32Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -568,7 +622,7 @@ mod tests { let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -583,7 +637,7 @@ mod tests { let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -598,7 +652,7 @@ mod tests { let int32_array = Int32Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -613,7 +667,7 @@ mod tests { let int32_array = Int32Array::from(vec![-1, 2, 3, -88]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -628,7 +682,7 @@ mod tests { let int64_array = Int64Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -643,7 +697,7 @@ mod tests { let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -658,7 +712,7 @@ mod tests { let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -673,7 +727,7 @@ mod tests { let int64_array = Int64Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -688,7 +742,7 @@ mod tests { let int64_array = Int64Array::from(vec![-1, 2, 3, -88]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -703,7 +757,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -718,7 +772,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -733,7 +787,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -748,7 +802,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -763,7 +817,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -778,7 +832,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -793,7 +847,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -808,7 +862,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -822,7 +876,7 @@ mod tests { ); let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -837,7 +891,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -852,7 +906,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -867,7 +921,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -882,7 +936,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -897,7 +951,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -912,7 +966,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -927,7 +981,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -942,7 +996,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -957,7 +1011,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -972,7 +1026,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -987,7 +1041,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let block = DataBlock::from_array(array_ref); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -1019,7 +1073,7 @@ mod tests { let arrays: Vec<&dyn arrow::array::Array> = vec![array1.as_ref(), array2.as_ref(), array3.as_ref()]; let concatenated = concat(&arrays).unwrap(); - let mut block = DataBlock::from_array(concatenated.clone()); + let block = DataBlock::from_array(concatenated.clone()); let expected_bit_width = Arc::new(UInt64Array::from(vec![ 2, @@ -1042,7 +1096,7 @@ mod tests { let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0); let mut gen = lance_datagen::array::rand_type(&DataType::Binary); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone()); assert_eq!( block.get_stat(Stat::BitWidth), None, @@ -1054,7 +1108,7 @@ mod tests { #[test] fn test_cardinality_variable_width_datablock() { let string_array = StringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1071,7 +1125,7 @@ mod tests { Some("to be passed as arguments to procedures"), Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![3])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1088,7 +1142,7 @@ mod tests { Some("Saunders Mac Lane"), Some("Samuel Eilenberg"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1101,7 +1155,7 @@ mod tests { ); let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1118,7 +1172,7 @@ mod tests { Some("to be passed as arguments to procedures"), Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![3])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1135,7 +1189,7 @@ mod tests { Some("Saunders Mac Lane"), Some("Samuel Eilenberg"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1151,7 +1205,8 @@ mod tests { #[test] fn test_max_length_variable_width_datablock() { let string_array = StringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); + let expected_max_length = Arc::new(UInt64Array::from(vec![string_array.value_length(0) as u64])) as ArrayRef; let actual_max_length = block.get_stat(Stat::MaxLength); @@ -1163,7 +1218,8 @@ mod tests { Some("to be passed as arguments to procedures"), // string that has max length Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); + let expected_max_length = Arc::new(UInt64Array::from(vec![string_array.value_length(1) as u64])) as ArrayRef; let actual_max_length = block.get_stat(Stat::MaxLength); @@ -1175,7 +1231,8 @@ mod tests { Some("Saunders Mac Lane"), // string that has max length Some("Samuel Eilenberg"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); + let expected_max_length = Arc::new(UInt64Array::from(vec![string_array.value_length(1) as u64])) as ArrayRef; let actual_max_length = block.get_stat(Stat::MaxLength); @@ -1183,7 +1240,8 @@ mod tests { assert_eq!(actual_max_length, Some(expected_max_length),); let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); + let expected_max_length = Arc::new(UInt64Array::from(vec![string_array.value(0).len() as u64])) as ArrayRef; let actual_max_length = block.get_stat(Stat::MaxLength); @@ -1195,7 +1253,8 @@ mod tests { Some("to be passed as arguments to procedures"), // string that has max length Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let block = DataBlock::from_array(string_array.clone()); + let expected_max_length = Arc::new(UInt64Array::from(vec![string_array.value_length(1) as u64])) as ArrayRef; let actual_max_length = block.get_stat(Stat::MaxLength);