Skip to content

Commit

Permalink
Merge branch 'refs/heads/add-stringview-binaryview-to-stat-convertor'
Browse files Browse the repository at this point in the history
  • Loading branch information
Kev1n8 committed Aug 2, 2024
2 parents ede5a64 + 27f58cd commit ccef93b
Showing 1 changed file with 103 additions and 10 deletions.
113 changes: 103 additions & 10 deletions parquet/src/arrow/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
use crate::file::statistics::Statistics as ParquetStatistics;
use crate::schema::types::SchemaDescriptor;
use arrow_array::builder::{
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, StringViewBuilder,
BinaryViewBuilder,
};
use arrow_array::{
new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Expand Down Expand Up @@ -446,14 +447,43 @@ macro_rules! get_statistics {
},
DataType::Dictionary(_, value_type) => {
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
},
DataType::Utf8View => {
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
let mut builder = StringViewBuilder::new();
for x in iterator {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

let Ok(x) = std::str::from_utf8(x) else {
builder.append_null();
continue;
};

builder.append_value(x);
}
Ok(Arc::new(builder.finish()))
},
DataType::BinaryView => {
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
let mut builder = BinaryViewBuilder::new();
for x in iterator {
let Some(x) = x else {
builder.append_null(); // no statistics value
continue;
};

builder.append_value(x);
}
Ok(Arc::new(builder.finish()))
}

DataType::Map(_,_) |
DataType::Duration(_) |
DataType::Interval(_) |
DataType::Null |
DataType::BinaryView |
DataType::Utf8View |
DataType::List(_) |
DataType::ListView(_) |
DataType::FixedSizeList(_, _) |
Expand Down Expand Up @@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
}
})
},
Some(DataType::FixedSizeBinary(size)) => {
Some(DataType::FixedSizeBinary(size)) => {
let mut builder = FixedSizeBinaryBuilder::new(*size);
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
Expand Down Expand Up @@ -1498,12 +1528,7 @@ mod test {
use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{i256, Date32Type, Date64Type};
use arrow::util::test_util::parquet_test_data;
use arrow_array::{
new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray,
TimestampNanosecondArray,
};
use arrow_array::{new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, StringViewArray, BinaryViewArray};
use arrow_schema::{DataType, Field, SchemaRef};
use bytes::Bytes;
use std::path::PathBuf;
Expand Down Expand Up @@ -1916,6 +1941,59 @@ mod test {
.run()
}

/// Checks min/max statistics for a `Utf8View` column.
///
/// The nine input values are annotated as three groups of three rows; the
/// expected arrays carry one min and one max per group, with `None` for the
/// group that contains only nulls.
// NOTE(review): how `Test::run` splits the input into row groups is not
// visible here — presumably three rows per group; confirm against `Test`.
#[test]
fn roundtrip_string_view() {
    let input = string_view_array([
        // row group 1
        Some("A"),
        None,
        Some("Q"),
        // row group 2
        Some("ZZ"),
        Some("AA"),
        None,
        // row group 3
        None,
        None,
        None,
    ]);
    let expected_min = string_view_array([Some("A"), Some("AA"), None]);
    let expected_max = string_view_array([Some("Q"), Some("ZZ"), None]);

    Test {
        input,
        expected_min,
        expected_max,
    }
    .run()
}

/// Checks min/max statistics for a `BinaryView` column.
///
/// Mirrors the string-view round trip: nine values annotated as three groups
/// of three rows, with the last group entirely null so its min and max are
/// expected to be `None`.
// NOTE(review): the row-group size is decided inside `Test::run`, which is not
// visible here — presumably three rows per group; confirm against `Test`.
#[test]
fn roundtrip_binary_view() {
    // The explicit element type coerces the differently sized byte-string
    // literals (`&[u8; 1]`, `&[u8; 2]`) to a common `&[u8]`.
    let values: Vec<Option<&[u8]>> = vec![
        // row group 1
        Some(b"A"),
        None,
        Some(b"Q"),
        // row group 2
        Some(b"ZZ"),
        Some(b"AA"),
        None,
        // row group 3
        None,
        None,
        None,
    ];
    let mins: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None];
    let maxes: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), None];

    Test {
        input: binary_view_array(values),
        expected_min: binary_view_array(mins),
        expected_max: binary_view_array(maxes),
    }
    .run()
}

#[test]
fn roundtrip_struct() {
let mut test = Test {
Expand Down Expand Up @@ -2539,4 +2617,19 @@ mod test {

Arc::new(array)
}

/// Builds a [`StringViewArray`] from optional string slices and returns it as
/// an [`ArrayRef`].
///
/// `None` items become null entries. The slices are collected directly:
/// `StringViewArray` can be collected from any `Option<impl AsRef<str>>`, so
/// no intermediate `String` needs to be allocated per element.
fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
    let array: StringViewArray = input.into_iter().collect();

    Arc::new(array)
}

/// Builds a [`BinaryViewArray`] from optional byte slices and returns it as an
/// [`ArrayRef`].
///
/// `None` items become null entries. The vector is handed to
/// `BinaryViewArray::from` as-is; it already has the element type that
/// conversion accepts, so it does not need to be re-collected first.
fn binary_view_array(input: Vec<Option<&[u8]>>) -> ArrayRef {
    let array = BinaryViewArray::from(input);

    Arc::new(array)
}
}

0 comments on commit ccef93b

Please sign in to comment.