Skip to content

Commit

Permalink
Add support for StringView and BinaryView statistics in `Statisti…
Browse files Browse the repository at this point in the history
…csConverter` (apache#6181)

* Add StringView and BinaryView support for the macro `get_statistics`

* Add StringView and BinaryView support for the macro `get_data_page_statistics`

* add tests to cover the support for StringView and BinaryView in the macro get_data_page_statistics

* found potential bugs and ignore the tests

* fake alarm! no bugs, fix the code by initiating all batches to have 5 rows

* make the get_stat StringView and BinaryView tests cover bytes greater than 12
  • Loading branch information
Kev1n8 authored Aug 3, 2024
1 parent 36d567b commit e6bd74b
Show file tree
Hide file tree
Showing 3 changed files with 270 additions and 20 deletions.
173 changes: 164 additions & 9 deletions parquet/src/arrow/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
use crate::file::statistics::Statistics as ParquetStatistics;
use crate::schema::types::SchemaDescriptor;
use arrow_array::builder::{
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
StringViewBuilder,
};
use arrow_array::{
new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Expand Down Expand Up @@ -446,14 +447,43 @@ macro_rules! get_statistics {
},
DataType::Dictionary(_, value_type) => {
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
},
DataType::Utf8View => {
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
let mut builder = StringViewBuilder::new();
for x in iterator {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

let Ok(x) = std::str::from_utf8(x) else {
builder.append_null();
continue;
};

builder.append_value(x);
}
Ok(Arc::new(builder.finish()))
},
DataType::BinaryView => {
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
let mut builder = BinaryViewBuilder::new();
for x in iterator {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

builder.append_value(x);
}
Ok(Arc::new(builder.finish()))
}

DataType::Map(_,_) |
DataType::Duration(_) |
DataType::Interval(_) |
DataType::Null |
DataType::BinaryView |
DataType::Utf8View |
DataType::List(_) |
DataType::ListView(_) |
DataType::FixedSizeList(_, _) |
Expand Down Expand Up @@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
}
})
},
Some(DataType::FixedSizeBinary(size)) => {
Some(DataType::FixedSizeBinary(size)) => {
let mut builder = FixedSizeBinaryBuilder::new(*size);
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
Expand All @@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
_ => unimplemented!()
Some(DataType::Utf8View) => {
let mut builder = StringViewBuilder::new();
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
for x in x.into_iter() {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

let Ok(x) = std::str::from_utf8(x.data()) else {
builder.append_null();
continue;
};

builder.append_value(x);
}
}
Ok(Arc::new(builder.finish()))
},
Some(DataType::BinaryView) => {
let mut builder = BinaryViewBuilder::new();
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
for x in x.into_iter() {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

builder.append_value(x);
}
}
Ok(Arc::new(builder.finish()))
},
Some(DataType::Null) |
Some(DataType::Duration(_)) |
Some(DataType::Interval(_)) |
Some(DataType::List(_)) |
Some(DataType::ListView(_)) |
Some(DataType::FixedSizeList(_, _)) |
Some(DataType::LargeList(_)) |
Some(DataType::LargeListView(_)) |
Some(DataType::Struct(_)) |
Some(DataType::Union(_, _)) |
Some(DataType::Map(_, _)) |
Some(DataType::RunEndEncoded(_, _)) => {
let len = $iterator.count();
// don't know how to extract statistics, so return a null array
Ok(new_null_array($data_type.unwrap(), len))
},
None => unimplemented!() // not sure how to handle this
}
}
}
Expand Down Expand Up @@ -1499,10 +1580,10 @@ mod test {
use arrow::datatypes::{i256, Date32Type, Date64Type};
use arrow::util::test_util::parquet_test_data;
use arrow_array::{
new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray,
TimestampNanosecondArray,
new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch,
StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
};
use arrow_schema::{DataType, Field, SchemaRef};
use bytes::Bytes;
Expand Down Expand Up @@ -1916,6 +1997,65 @@ mod test {
.run()
}

#[test]
fn roundtrip_string_view() {
Test {
input: string_view_array([
// row group 1
Some("A"),
None,
Some("Q"),
// row group 2
Some("ZZ"),
Some("A_longerthan12"),
None,
// row group 3
Some("A_longerthan12"),
None,
None,
]),
expected_min: string_view_array([
Some("A"),
Some("A_longerthan12"),
Some("A_longerthan12"),
]),
expected_max: string_view_array([Some("Q"), Some("ZZ"), Some("A_longerthan12")]),
}
.run()
}

#[test]
fn roundtrip_binary_view() {
let input: Vec<Option<&[u8]>> = vec![
// row group 1
Some(b"A"),
None,
Some(b"Q"),
// row group 2
Some(b"ZZ"),
Some(b"A_longerthan12"),
None,
// row group 3
Some(b"A_longerthan12"),
None,
None,
];

let expected_min: Vec<Option<&[u8]>> =
vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")];
let expected_max: Vec<Option<&[u8]>> =
vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")];

let array = binary_view_array(input);

Test {
input: array,
expected_min: binary_view_array(expected_min),
expected_max: binary_view_array(expected_max),
}
.run()
}

#[test]
fn roundtrip_struct() {
let mut test = Test {
Expand Down Expand Up @@ -2539,4 +2679,19 @@ mod test {

Arc::new(array)
}

fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
let array: StringViewArray = input
.into_iter()
.map(|s| s.map(|s| s.to_string()))
.collect();

Arc::new(array)
}

fn binary_view_array(input: Vec<Option<&[u8]>>) -> ArrayRef {
let array = BinaryViewArray::from(input.into_iter().collect::<Vec<Option<&[u8]>>>());

Arc::new(array)
}
}
55 changes: 48 additions & 7 deletions parquet/tests/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@

use arrow_array::types::{Int32Type, Int8Type};
use arrow_array::{
Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, Float32Array,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
LargeStringArray, RecordBatch, StringArray, StructArray, Time32MillisecondArray,
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
UInt32Array, UInt64Array, UInt8Array,
Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array,
Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray,
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow_buffer::i256;
use arrow_schema::{DataType, Field, Schema, TimeUnit};
Expand Down Expand Up @@ -88,6 +88,8 @@ enum Scenario {
PeriodsInColumnNames,
StructArray,
UTF8,
UTF8View,
BinaryView,
}

fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
Expand Down Expand Up @@ -589,6 +591,16 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch {
.unwrap()
}

fn make_utf8_view_batch(value: Vec<Option<&str>>) -> RecordBatch {
let utf8_view = StringViewArray::from(value);
RecordBatch::try_from_iter(vec![("utf8_view", Arc::new(utf8_view) as _)]).unwrap()
}

fn make_binary_view_batch(value: Vec<Option<&[u8]>>) -> RecordBatch {
let binary_view = BinaryViewArray::from(value);
RecordBatch::try_from_iter(vec![("binary_view", Arc::new(binary_view) as _)]).unwrap()
}

fn make_dict_batch() -> RecordBatch {
let values = [
Some("abc"),
Expand Down Expand Up @@ -972,6 +984,35 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]),
]
}
Scenario::UTF8View => {
// Make utf8_view batch including string length <12 and >12 bytes
// as the internal representation of StringView is differed for strings
// shorter and longer than that length
vec![
make_utf8_view_batch(vec![Some("a"), Some("b"), Some("c"), Some("d"), None]),
make_utf8_view_batch(vec![Some("a"), Some("e_longerthan12"), None, None, None]),
make_utf8_view_batch(vec![
Some("e_longerthan12"),
Some("f_longerthan12"),
Some("g_longerthan12"),
Some("h_longerthan12"),
Some("i_longerthan12"),
]),
]
}
Scenario::BinaryView => {
vec![
make_binary_view_batch(vec![Some(b"a"), Some(b"b"), Some(b"c"), Some(b"d"), None]),
make_binary_view_batch(vec![Some(b"a"), Some(b"e_longerthan12"), None, None, None]),
make_binary_view_batch(vec![
Some(b"e_longerthan12"),
Some(b"f_longerthan12"),
Some(b"g_longerthan12"),
Some(b"h_longerthan12"),
Some(b"i_longerthan12"),
]),
]
}
}
}

Expand Down
62 changes: 58 additions & 4 deletions parquet/tests/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ use arrow::datatypes::{
TimestampNanosecondType, TimestampSecondType,
};
use arrow_array::{
make_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray,
Date32Array, Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray,
Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
LargeStringArray, RecordBatch, StringArray, StringViewArray, Time32MillisecondArray,
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
UInt32Array, UInt64Array, UInt8Array,
};
Expand Down Expand Up @@ -2059,6 +2059,60 @@ async fn test_utf8() {
.run();
}

// UTF8View
#[tokio::test]
async fn test_utf8_view() {
let reader = TestReader {
scenario: Scenario::UTF8View,
row_per_group: 5,
}
.build()
.await;

// test for utf8_view
Test {
reader: &reader,
expected_min: Arc::new(StringViewArray::from(vec!["a", "a", "e_longerthan12"])),
expected_max: Arc::new(StringViewArray::from(vec![
"d",
"e_longerthan12",
"i_longerthan12",
])),
expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
column_name: "utf8_view",
check: Check::Both,
}
.run()
}

// BinaryView
#[tokio::test]
async fn test_binary_view() {
let reader = TestReader {
scenario: Scenario::BinaryView,
row_per_group: 5,
}
.build()
.await;

let expected_min: Vec<Option<&[u8]>> = vec![Some(b"a"), Some(b"a"), Some(b"e_longerthan12")];
let expected_max: Vec<Option<&[u8]>> =
vec![Some(b"d"), Some(b"e_longerthan12"), Some(b"i_longerthan12")];

// test for utf8_view
Test {
reader: &reader,
expected_min: Arc::new(BinaryViewArray::from(expected_min)),
expected_max: Arc::new(BinaryViewArray::from(expected_max)),
expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
column_name: "binary_view",
check: Check::Both,
}
.run()
}

////// Files with missing statistics ///////

#[tokio::test]
Expand Down

0 comments on commit e6bd74b

Please sign in to comment.