From e70c16d67d866ab7e4a75ef584c6fdc7b91e687c Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Thu, 11 Jul 2024 04:39:44 -0400 Subject: [PATCH] Clean up unused code for view types in offset buffer (#6040) * clean up unused view types in offset buffer * make tests happy --- parquet/src/arrow/array_reader/byte_array.rs | 114 +------------------ parquet/src/arrow/buffer/offset_buffer.rs | 100 +++------------- 2 files changed, 14 insertions(+), 200 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 80c448edc637..ed5961586d91 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -599,7 +599,7 @@ mod tests { use super::*; use crate::arrow::array_reader::test_util::{byte_array_all_encodings, utf8_column}; use crate::arrow::record_reader::buffer::ValuesBuffer; - use arrow_array::{Array, StringArray, StringViewArray}; + use arrow_array::{Array, StringArray}; use arrow_buffer::Buffer; #[test] @@ -657,64 +657,6 @@ mod tests { } } - #[test] - fn test_byte_array_string_view_decoder() { - let (pages, encoded_dictionary) = - byte_array_all_encodings(vec!["hello", "world", "large payload over 12 bytes", "b"]); - - let column_desc = utf8_column(); - let mut decoder = ByteArrayColumnValueDecoder::new(&column_desc); - - decoder - .set_dict(encoded_dictionary, 4, Encoding::RLE_DICTIONARY, false) - .unwrap(); - - for (encoding, page) in pages { - let mut output = OffsetBuffer::::default(); - decoder.set_data(encoding, page, 4, Some(4)).unwrap(); - - assert_eq!(decoder.read(&mut output, 1).unwrap(), 1); - - assert_eq!(output.values.as_slice(), "hello".as_bytes()); - assert_eq!(output.offsets.as_slice(), &[0, 5]); - - assert_eq!(decoder.read(&mut output, 1).unwrap(), 1); - assert_eq!(output.values.as_slice(), "helloworld".as_bytes()); - assert_eq!(output.offsets.as_slice(), &[0, 5, 10]); - - assert_eq!(decoder.read(&mut output, 2).unwrap(), 2); - assert_eq!( - output.values.as_slice(), - "helloworldlarge payload over 12 bytesb".as_bytes() - ); - assert_eq!(output.offsets.as_slice(), &[0, 5, 10, 37, 38]); - - assert_eq!(decoder.read(&mut output, 4).unwrap(), 0); - - let valid = [false, false, true, true, false, true, true, false, false]; - let valid_buffer = Buffer::from_iter(valid.iter().cloned()); - - output.pad_nulls(0, 4, valid.len(), valid_buffer.as_slice()); - let array = output.into_array(Some(valid_buffer), ArrowType::Utf8View); - let strings = array.as_any().downcast_ref::().unwrap(); - - assert_eq!( - strings.iter().collect::>(), - vec![ - None, - None, - Some("hello"), - Some("world"), - None, - Some("large payload over 12 bytes"), - Some("b"), - None, - None, - ] - ); - } - } - #[test] fn test_byte_array_decoder_skip() { let (pages, encoded_dictionary) = @@ -759,60 +701,6 @@ mod tests { } } - #[test] - fn test_byte_array_string_view_decoder_skip() { - let (pages, encoded_dictionary) = - byte_array_all_encodings(vec!["hello", "world", "a", "large payload over 12 bytes"]); - - let column_desc = utf8_column(); - let mut decoder = ByteArrayColumnValueDecoder::new(&column_desc); - - decoder - .set_dict(encoded_dictionary, 4, Encoding::RLE_DICTIONARY, false) - .unwrap(); - - for (encoding, page) in pages { - let mut output = OffsetBuffer::::default(); - decoder.set_data(encoding, page, 4, Some(4)).unwrap(); - - assert_eq!(decoder.read(&mut output, 1).unwrap(), 1); - - assert_eq!(output.values.as_slice(), "hello".as_bytes()); - assert_eq!(output.offsets.as_slice(), &[0, 5]); - - assert_eq!(decoder.skip_values(1).unwrap(), 1); - assert_eq!(decoder.skip_values(1).unwrap(), 1); - - assert_eq!(decoder.read(&mut output, 1).unwrap(), 1); - assert_eq!( - output.values.as_slice(), - "hellolarge payload over 12 bytes".as_bytes() - ); - assert_eq!(output.offsets.as_slice(), &[0, 5, 32]); - - assert_eq!(decoder.read(&mut output, 4).unwrap(), 0); - - let valid = [false, false, true, true, false, false]; - let valid_buffer = Buffer::from_iter(valid.iter().cloned()); - - output.pad_nulls(0, 2, valid.len(), valid_buffer.as_slice()); - let array = output.into_array(Some(valid_buffer), ArrowType::Utf8View); - let strings = array.as_any().downcast_ref::().unwrap(); - - assert_eq!( - strings.iter().collect::>(), - vec![ - None, - None, - Some("hello"), - Some("large payload over 12 bytes"), - None, - None, - ] - ); - } - } - #[test] fn test_byte_array_decoder_nulls() { let (pages, encoded_dictionary) = byte_array_all_encodings(Vec::<&str>::new()); diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 806f144d9666..ce9eb1142a5b 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -18,13 +18,10 @@ use crate::arrow::buffer::bit_util::iter_set_bits_rev; use crate::arrow::record_reader::buffer::ValuesBuffer; use crate::errors::{ParquetError, Result}; -use arrow_array::builder::GenericByteViewBuilder; -use arrow_array::types::BinaryViewType; use arrow_array::{make_array, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::DataType as ArrowType; -use std::sync::Arc; /// A buffer of variable-sized byte arrays that can be converted into /// a corresponding [`ArrayRef`] @@ -128,51 +125,18 @@ impl OffsetBuffer { /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer` pub fn into_array(self, null_buffer: Option, data_type: ArrowType) -> ArrayRef { - match data_type { - ArrowType::Utf8View => { - let mut builder = self.build_generic_byte_view(); - Arc::new(builder.finish().to_string_view().unwrap()) - } - ArrowType::BinaryView => { - let mut builder = self.build_generic_byte_view(); - Arc::new(builder.finish()) - } - _ => { - let array_data_builder = ArrayDataBuilder::new(data_type) - .len(self.len()) - .add_buffer(Buffer::from_vec(self.offsets)) - .add_buffer(Buffer::from_vec(self.values)) - .null_bit_buffer(null_buffer); - - let data = match cfg!(debug_assertions) { - true => array_data_builder.build().unwrap(), - false => unsafe { array_data_builder.build_unchecked() }, - }; - - make_array(data) - } - } - } - - fn build_generic_byte_view(self) -> GenericByteViewBuilder { - let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); - let buffer = self.values.into(); - let block = builder.append_block(buffer); - for window in self.offsets.windows(2) { - let start = window[0]; - let end = window[1]; - let len = (end - start).to_usize().unwrap(); - - if len != 0 { - // Safety: (1) the buffer is valid (2) the offsets are valid (3) the values in between are of ByteViewType - unsafe { - builder.append_view_unchecked(block, start.as_usize() as u32, len as u32); - } - } else { - builder.append_null(); - } - } - builder + let array_data_builder = ArrayDataBuilder::new(data_type) + .len(self.len()) + .add_buffer(Buffer::from_vec(self.offsets)) + .add_buffer(Buffer::from_vec(self.values)) + .null_bit_buffer(null_buffer); + + let data = match cfg!(debug_assertions) { + true => array_data_builder.build().unwrap(), + false => unsafe { array_data_builder.build_unchecked() }, + }; + + make_array(data) } } @@ -229,7 +193,7 @@ impl ValuesBuffer for OffsetBuffer { #[cfg(test)] mod tests { use super::*; - use arrow_array::{Array, LargeStringArray, StringArray, StringViewArray}; + use arrow_array::{Array, LargeStringArray, StringArray}; #[test] fn test_offset_buffer_empty() { @@ -280,44 +244,6 @@ mod tests { ); } - #[test] - fn test_string_view() { - let mut buffer = OffsetBuffer::::default(); - for v in [ - "hello", - "world", - "large payload over 12 bytes", - "a", - "b", - "c", - ] { - buffer.try_push(v.as_bytes(), false).unwrap() - } - let split = std::mem::take(&mut buffer); - - let array = split.into_array(None, ArrowType::Utf8View); - let strings = array.as_any().downcast_ref::().unwrap(); - assert_eq!( - strings.iter().map(|x| x.unwrap()).collect::>(), - vec![ - "hello", - "world", - "large payload over 12 bytes", - "a", - "b", - "c" - ] - ); - - buffer.try_push("test".as_bytes(), false).unwrap(); - let array = buffer.into_array(None, ArrowType::Utf8View); - let strings = array.as_any().downcast_ref::().unwrap(); - assert_eq!( - strings.iter().map(|x| x.unwrap()).collect::>(), - vec!["test"] - ); - } - #[test] fn test_offset_buffer_pad_nulls() { let mut buffer = OffsetBuffer::::default();