From 8382ab88a074fe652034c9f1f0cb131376a394d8 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Tue, 17 Dec 2024 14:34:51 +0100 Subject: [PATCH] Allow hints for upcasting parquet to arrow integer types --- parquet/src/arrow/arrow_reader/mod.rs | 36 +++++++++++++++++++++------ parquet/src/arrow/schema/primitive.rs | 5 ++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 378884a1c430..d64a9a718b2d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3294,11 +3294,13 @@ mod tests { let nested_fields = Fields::from(vec![ Field::new("utf8_to_dict", ArrowDataType::Utf8, false), Field::new("int64_to_ts_nano", ArrowDataType::Int64, false), + Field::new("int16_to_int32", ArrowDataType::Int16, false), ]); let nested_arrays: Vec<ArrayRef> = vec![ Arc::new(StringArray::from(vec!["a", "a", "a", "b"])) as ArrayRef, Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef, + Arc::new(Int16Array::from(vec![1, 2, 3, 4])) as ArrayRef, ]; let nested = StructArray::try_new(nested_fields, nested_arrays, None).unwrap(); @@ -3312,6 +3314,10 @@ mod tests { "date32_to_date64", Arc::new(Date32Array::from(vec![0, 1, 2, 3])) as ArrayRef, ), + ( + "int8_to_int64", + Arc::new(Int8Array::from(vec![0, 1, 2, 3])) as ArrayRef, + ), ("nested", Arc::new(nested) as ArrayRef), ]); @@ -3326,21 +3332,20 @@ mod tests { ), Field::new( "int64_to_ts_nano", - ArrowDataType::Timestamp( - arrow::datatypes::TimeUnit::Nanosecond, - Some("+10:00".into()), - ), + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+10:00".into())), false, ), + Field::new("int16_to_int32", ArrowDataType::Int32, false), ]); let supplied_schema = Arc::new(Schema::new(vec![ Field::new( "int32_to_ts_second", - ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Second, Some("+01:00".into())), + ArrowDataType::Timestamp(TimeUnit::Second, Some("+01:00".into())), false, ), Field::new("date32_to_date64", 
ArrowDataType::Date64, false), + Field::new("int8_to_int64", ArrowDataType::Int64, false), Field::new( "nested", ArrowDataType::Struct(supplied_nested_fields), @@ -3359,7 +3364,7 @@ mod tests { assert_eq!(arrow_reader.schema(), supplied_schema); let batch = arrow_reader.next().unwrap().unwrap(); - assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.num_columns(), 4); assert_eq!(batch.num_rows(), 4); assert_eq!( batch @@ -3383,9 +3388,17 @@ mod tests { .expect("value as date"), "1970-01-01" ); + assert_eq!( + batch + .column(2) + .as_any() + .downcast_ref::<Int64Array>() + .expect("downcast to int64"), + &Int64Array::from(vec![0, 1, 2, 3]), + ); let nested = batch - .column(2) + .column(3) .as_any() .downcast_ref::<StructArray>() .expect("downcast to struct"); @@ -3423,6 +3436,15 @@ mod tests { .expect("value as datetime"), "1970-01-01 10:00:00.000000001 +10:00" ); + + assert_eq!( + nested + .column(2) + .as_any() + .downcast_ref::<Int32Array>() + .expect("downcast to int64"), + &Int32Array::from(vec![1, 2, 3, 4]), + ); } #[test] diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 9f215b4dc07e..92941b54101d 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -47,6 +47,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType { // Date64 doesn't have a corresponding LogicalType / ConvertedType (DataType::Int64, DataType::Date64) => hint, + // Allow up-casting integers (i.e. no precision loss) + (DataType::Int8, DataType::Int16) => hint, + (DataType::Int8 | DataType::Int16, DataType::Int32) => hint, + (DataType::Int8 | DataType::Int16 | DataType::Int32, DataType::Int64) => hint, + // Coerce Date32 back to Date64 (#1666) (DataType::Date32, DataType::Date64) => hint,