diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 6a1434bce906..2003ddf82597 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex}; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ - BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, + BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, StringViewBuilder, + BinaryViewBuilder, }; use arrow_array::{ new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, @@ -446,14 +447,43 @@ macro_rules! get_statistics { }, DataType::Dictionary(_, value_type) => { [<$stat_type_prefix:lower _ statistics>](value_type, $iterator) + }, + DataType::Utf8View => { + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); + let mut builder = StringViewBuilder::new(); + for x in iterator { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + let Ok(x) = std::str::from_utf8(x) else { + builder.append_null(); + continue; + }; + + builder.append_value(x); + } + Ok(Arc::new(builder.finish())) + }, + DataType::BinaryView => { + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); + let mut builder = BinaryViewBuilder::new(); + for x in iterator { + let Some(x) = x else { + builder.append_null(); // no statistics value + continue; + }; + + builder.append_value(x); + } + Ok(Arc::new(builder.finish())) } DataType::Map(_,_) | DataType::Duration(_) | DataType::Interval(_) | DataType::Null | - DataType::BinaryView | - DataType::Utf8View | DataType::List(_) | DataType::ListView(_) | DataType::FixedSizeList(_, _) | @@ -919,7 +949,7 @@ macro_rules! 
get_data_page_statistics { } }) }, - Some(DataType::FixedSizeBinary(size)) => { + Some(DataType::FixedSizeBinary(size)) => { let mut builder = FixedSizeBinaryBuilder::new(*size); let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator); for x in iterator { @@ -1498,12 +1528,7 @@ mod test { use arrow::compute::kernels::cast_utils::Parser; use arrow::datatypes::{i256, Date32Type, Date64Type}; use arrow::util::test_util::parquet_test_data; - use arrow_array::{ - new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray, - TimestampNanosecondArray, - }; + use arrow_array::{new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, StringViewArray, BinaryViewArray}; use arrow_schema::{DataType, Field, SchemaRef}; use bytes::Bytes; use std::path::PathBuf; @@ -1916,6 +1941,59 @@ mod test { .run() } + #[test] + fn roundtrip_string_view() { + Test { + input: string_view_array([ + // row group 1 + Some("A"), + None, + Some("Q"), + // row group 2 + Some("ZZ"), + Some("AA"), + None, + // row group 3 + None, + None, + None, + ]), + expected_min: string_view_array([Some("A"), Some("AA"), None]), + expected_max: string_view_array([Some("Q"), Some("ZZ"), None]), + } + .run() + } + + #[test] + fn roundtrip_binary_view() { + let input: Vec<Option<&[u8]>> = vec![ + // row group 1 + Some(b"A"), + None, + Some(b"Q"), + // row group 2 + Some(b"ZZ"), + Some(b"AA"), + None, + // row group 3 + None, + None, + None, + ]; + + let expected_min: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None]; + let expected_max: Vec<Option<&[u8]>> = vec![Some(b"Q"), 
Some(b"ZZ"), None]; + + let array = binary_view_array(input); + + Test { + input: array, + expected_min: binary_view_array(expected_min), + expected_max: binary_view_array(expected_max), + } + .run() + } + #[test] fn roundtrip_struct() { let mut test = Test { @@ -2539,4 +2617,19 @@ mod test { Arc::new(array) } + + fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef { + let array: StringViewArray = input + .into_iter() + .map(|s| s.map(|s| s.to_string())) + .collect(); + + Arc::new(array) + } + + fn binary_view_array<'a>(input: Vec<Option<&'a [u8]>>) -> ArrayRef { + let array = BinaryViewArray::from(input.into_iter().collect::<Vec<Option<&[u8]>>>()); + + Arc::new(array) + } }