From e6bd74b2c381e67a2b08e15fc23672f8317436c4 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+Kev1n8@users.noreply.github.com> Date: Sat, 3 Aug 2024 19:21:14 +0800 Subject: [PATCH] Add support for `StringView` and `BinaryView` statistics in `StatisticsConverter` (#6181) * Add StringView and BinaryView support for the macro `get_statistics` * Add StringView and BinaryView support for the macro `get_data_page_statistics` * add tests to cover the support for StringView and BinaryView in the macro get_data_page_statistics * found potential bugs and ignore the tests * fake alarm! no bugs, fix the code by initiating all batches to have 5 rows * make the get_stat StringView and BinaryView tests cover bytes greater than 12 --- parquet/src/arrow/arrow_reader/statistics.rs | 173 ++++++++++++++++++- parquet/tests/arrow_reader/mod.rs | 55 +++++- parquet/tests/arrow_reader/statistics.rs | 62 ++++++- 3 files changed, 270 insertions(+), 20 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 6a1434bce906..c42f92838c8c 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex}; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ - BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, + BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, + StringViewBuilder, }; use arrow_array::{ new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, @@ -446,14 +447,43 @@ macro_rules! get_statistics { }, DataType::Dictionary(_, value_type) => { [<$stat_type_prefix:lower _ statistics>](value_type, $iterator) + }, + DataType::Utf8View => { + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); + let mut builder = StringViewBuilder::new(); + for x in iterator { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + let Ok(x) = std::str::from_utf8(x) else { + builder.append_null(); + continue; + }; + + builder.append_value(x); + } + Ok(Arc::new(builder.finish())) + }, + DataType::BinaryView => { + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); + let mut builder = BinaryViewBuilder::new(); + for x in iterator { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + builder.append_value(x); + } + Ok(Arc::new(builder.finish())) } DataType::Map(_,_) | DataType::Duration(_) | DataType::Interval(_) | DataType::Null | - DataType::BinaryView | - DataType::Utf8View | DataType::List(_) | DataType::ListView(_) | DataType::FixedSizeList(_, _) | @@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics { } }) }, - Some(DataType::FixedSizeBinary(size)) => { + Some(DataType::FixedSizeBinary(size)) => { let mut builder = FixedSizeBinaryBuilder::new(*size); let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator); for x in iterator { @@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics { } Ok(Arc::new(builder.finish())) }, - _ => unimplemented!() + Some(DataType::Utf8View) => { + let mut builder = StringViewBuilder::new(); + let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator); + for x in iterator { + for x in x.into_iter() { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + let Ok(x) = std::str::from_utf8(x.data()) else { + builder.append_null(); + continue; + }; + + builder.append_value(x); + } + } + Ok(Arc::new(builder.finish())) + }, + Some(DataType::BinaryView) => { + let mut builder = BinaryViewBuilder::new(); + let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator); + for x in iterator { + for x in x.into_iter() { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + builder.append_value(x); + } + } + Ok(Arc::new(builder.finish())) + }, + Some(DataType::Null) | + Some(DataType::Duration(_)) | + Some(DataType::Interval(_)) | + Some(DataType::List(_)) | + Some(DataType::ListView(_)) | + Some(DataType::FixedSizeList(_, _)) | + Some(DataType::LargeList(_)) | + Some(DataType::LargeListView(_)) | + Some(DataType::Struct(_)) | + Some(DataType::Union(_, _)) | + Some(DataType::Map(_, _)) | + Some(DataType::RunEndEncoded(_, _)) => { + let len = $iterator.count(); + // don't know how to extract statistics, so return a null array + Ok(new_null_array($data_type.unwrap(), len)) + }, + None => unimplemented!() // not sure how to handle this } } } @@ -1499,10 +1580,10 @@ mod test { use arrow::datatypes::{i256, Date32Type, Date64Type}; use arrow::util::test_util::parquet_test_data; use arrow_array::{ - new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray, - TimestampNanosecondArray, + new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, + BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, + StringArray, StringViewArray, StructArray, TimestampNanosecondArray, }; use arrow_schema::{DataType, Field, SchemaRef}; use bytes::Bytes; @@ -1916,6 +1997,65 @@ mod test { .run() } + #[test] + fn roundtrip_string_view() { + Test { + input: string_view_array([ + // row group 1 + Some("A"), + None, + Some("Q"), + // row group 2 + Some("ZZ"), + Some("A_longerthan12"), + None, + // row group 3 + Some("A_longerthan12"), + None, + None, + ]), + expected_min: string_view_array([ + Some("A"), + Some("A_longerthan12"), + Some("A_longerthan12"), + ]), + expected_max: string_view_array([Some("Q"), Some("ZZ"), Some("A_longerthan12")]), + } + .run() + } + + #[test] + fn roundtrip_binary_view() { + let input: Vec> = vec![ + // row group 1 + Some(b"A"), + None, + Some(b"Q"), + // row group 2 + Some(b"ZZ"), + Some(b"A_longerthan12"), + None, + // row group 3 + Some(b"A_longerthan12"), + None, + None, + ]; + + let expected_min: Vec> = + vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")]; + let expected_max: Vec> = + vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")]; + + let array = binary_view_array(input); + + Test { + input: array, + expected_min: binary_view_array(expected_min), + expected_max: binary_view_array(expected_max), + } + .run() + } + #[test] fn roundtrip_struct() { let mut test = Test { @@ -2539,4 +2679,19 @@ mod test { Arc::new(array) } + + fn string_view_array<'a>(input: impl IntoIterator>) -> ArrayRef { + let array: StringViewArray = input + .into_iter() + .map(|s| s.map(|s| s.to_string())) + .collect(); + + Arc::new(array) + } + + fn binary_view_array(input: Vec>) -> ArrayRef { + let array = BinaryViewArray::from(input.into_iter().collect::>>()); + + Arc::new(array) + } } diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index 4f63a505488c..7e979dcf3ec0 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ b/parquet/tests/arrow_reader/mod.rs @@ -17,13 +17,13 @@ use arrow_array::types::{Int32Type, Int8Type}; use arrow_array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, - Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, - LargeStringArray, RecordBatch, StringArray, StructArray, Time32MillisecondArray, - Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_buffer::i256; use arrow_schema::{DataType, Field, Schema, TimeUnit}; @@ -88,6 +88,8 @@ enum Scenario { PeriodsInColumnNames, StructArray, UTF8, + UTF8View, + BinaryView, } fn make_boolean_batch(v: Vec>) -> RecordBatch { @@ -589,6 +591,16 @@ fn make_utf8_batch(value: Vec>) -> RecordBatch { .unwrap() } +fn make_utf8_view_batch(value: Vec>) -> RecordBatch { + let utf8_view = StringViewArray::from(value); + RecordBatch::try_from_iter(vec![("utf8_view", Arc::new(utf8_view) as _)]).unwrap() +} + +fn make_binary_view_batch(value: Vec>) -> RecordBatch { + let binary_view = BinaryViewArray::from(value); + RecordBatch::try_from_iter(vec![("binary_view", Arc::new(binary_view) as _)]).unwrap() +} + fn make_dict_batch() -> RecordBatch { let values = [ Some("abc"), @@ -972,6 +984,35 @@ fn create_data_batch(scenario: Scenario) -> Vec { make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]), ] } + Scenario::UTF8View => { + // Make utf8_view batch including string length <12 and >12 bytes + // as the internal representation of StringView is differed for strings + // shorter and longer than that length + vec![ + make_utf8_view_batch(vec![Some("a"), Some("b"), Some("c"), Some("d"), None]), + make_utf8_view_batch(vec![Some("a"), Some("e_longerthan12"), None, None, None]), + make_utf8_view_batch(vec![ + Some("e_longerthan12"), + Some("f_longerthan12"), + Some("g_longerthan12"), + Some("h_longerthan12"), + Some("i_longerthan12"), + ]), + ] + } + Scenario::BinaryView => { + vec![ + make_binary_view_batch(vec![Some(b"a"), Some(b"b"), Some(b"c"), Some(b"d"), None]), + make_binary_view_batch(vec![Some(b"a"), Some(b"e_longerthan12"), None, None, None]), + make_binary_view_batch(vec![ + Some(b"e_longerthan12"), + Some(b"f_longerthan12"), + Some(b"g_longerthan12"), + Some(b"h_longerthan12"), + Some(b"i_longerthan12"), + ]), + ] + } } } diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 5702967ffdf4..75a73ac1309f 100644 --- a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -29,11 +29,11 @@ use arrow::datatypes::{ TimestampNanosecondType, TimestampSecondType, }; use arrow_array::{ - make_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, + make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, + Date32Array, Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, - LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + LargeStringArray, RecordBatch, StringArray, StringViewArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; @@ -2059,6 +2059,60 @@ async fn test_utf8() { .run(); } +// UTF8View +#[tokio::test] +async fn test_utf8_view() { + let reader = TestReader { + scenario: Scenario::UTF8View, + row_per_group: 5, + } + .build() + .await; + + // test for utf8_view + Test { + reader: &reader, + expected_min: Arc::new(StringViewArray::from(vec!["a", "a", "e_longerthan12"])), + expected_max: Arc::new(StringViewArray::from(vec![ + "d", + "e_longerthan12", + "i_longerthan12", + ])), + expected_null_counts: UInt64Array::from(vec![1, 3, 0]), + expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + column_name: "utf8_view", + check: Check::Both, + } + .run() +} + +// BinaryView +#[tokio::test] +async fn test_binary_view() { + let reader = TestReader { + scenario: Scenario::BinaryView, + row_per_group: 5, + } + .build() + .await; + + let expected_min: Vec> = vec![Some(b"a"), Some(b"a"), Some(b"e_longerthan12")]; + let expected_max: Vec> = + vec![Some(b"d"), Some(b"e_longerthan12"), Some(b"i_longerthan12")]; + + // test for utf8_view + Test { + reader: &reader, + expected_min: Arc::new(BinaryViewArray::from(expected_min)), + expected_max: Arc::new(BinaryViewArray::from(expected_max)), + expected_null_counts: UInt64Array::from(vec![1, 3, 0]), + expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + column_name: "binary_view", + check: Check::Both, + } + .run() +} + ////// Files with missing statistics /////// #[tokio::test]