diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 1112acacfcd9..a4192f2ad2f5 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1180,6 +1180,7 @@ def_from_for_primitive!(Int8Type, i8); def_from_for_primitive!(Int16Type, i16); def_from_for_primitive!(Int32Type, i32); def_from_for_primitive!(Int64Type, i64); +def_from_for_primitive!(Int128Type, i128); def_from_for_primitive!(UInt8Type, u8); def_from_for_primitive!(UInt16Type, u16); def_from_for_primitive!(UInt32Type, u32); diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 2e21f3e7e640..99ac5df5152e 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -330,6 +330,54 @@ macro_rules! downcast_primitive { }; } +/// This macro functions similarly to [`downcast_primitive`], but it excludes +/// [`arrow_schema::IntervalUnit::DayTime`] and [`arrow_schema::IntervalUnit::MonthDayNano`] +/// because they cannot be simply cast to primitive types during a comparison operation. +#[macro_export] +macro_rules! downcast_primitive_cmp { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_integer! { + $($data_type),+ => ($m $(, $args)*), + $crate::repeat_pat!(arrow_schema::DataType::Float16, $($data_type),+) => { + $m!($crate::types::Float16Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Float32, $($data_type),+) => { + $m!($crate::types::Float32Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Float64, $($data_type),+) => { + $m!($crate::types::Float64Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Decimal128(_, _), $($data_type),+) => { + $m!($crate::types::Decimal128Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Decimal256(_, _), $($data_type),+) => { + $m!($crate::types::Decimal256Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($data_type),+) => { + $m!($crate::types::IntervalYearMonthType $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), $($data_type),+) => { + $m!($crate::types::DurationSecondType $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), $($data_type),+) => { + $m!($crate::types::DurationMillisecondType $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), $($data_type),+) => { + $m!($crate::types::DurationMicrosecondType $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), $($data_type),+) => { + $m!($crate::types::DurationNanosecondType $(, $args)*) + } + _ => { + $crate::downcast_temporal! { + $($data_type),+ => ($m $(, $args)*), + $($p => $fallback,)* + } + } + } + }; +} + #[macro_export] #[doc(hidden)] macro_rules! downcast_primitive_array_helper { @@ -383,6 +431,28 @@ macro_rules! downcast_primitive_array { }; } +/// This macro serves a similar function to [`downcast_primitive_array`], but it +/// incorporates [`downcast_primitive_cmp`]. [`downcast_primitive_cmp`] is a specialized +/// version of [`downcast_primitive`] designed specifically for comparison operations. +#[macro_export] +macro_rules! downcast_primitive_array_cmp { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array_cmp!($values => {$e} $($p => $fallback)*) + }; + (($($values:ident),+) => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array_cmp!($($values),+ => {$e} $($p => $fallback)*) + }; + ($($values:ident),+ => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array_cmp!(($($values),+) => $e $($p => $fallback)*) + }; + (($($values:ident),+) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_cmp!{ + $($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e), + $($p => $fallback,)* + } + }; +} + /// Force downcast of an [`Array`], such as an [`ArrayRef`], to /// [`PrimitiveArray`], panic'ing on failure. /// diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 16d0e822d052..2b7c6c527d48 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -107,6 +107,12 @@ make_type!( DataType::Int64, "A signed 64-bit integer type." ); +make_type!( + Int128Type, + i128, + DataType::Int128, + "A signed 128-bit integer type." +); make_type!( UInt8Type, u8, diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 10c53c549e2b..d10ac4cd5635 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -84,6 +84,7 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Int128 | DataType::Float16 | DataType::Float32 | DataType::Float64 @@ -1509,6 +1510,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::Int16 => DataTypeLayout::new_fixed_width::(), DataType::Int32 => DataTypeLayout::new_fixed_width::(), DataType::Int64 => DataTypeLayout::new_fixed_width::(), + DataType::Int128 => DataTypeLayout::new_fixed_width::(), DataType::UInt8 => DataTypeLayout::new_fixed_width::(), DataType::UInt16 => DataTypeLayout::new_fixed_width::(), DataType::UInt32 => DataTypeLayout::new_fixed_width::(), diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index b279546474a0..ba56c42704bd 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -74,6 +74,7 @@ fn equal_values( DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int128 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Decimal128(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 268cf10f2326..2b243fab4ccc 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -209,6 +209,7 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Int16 => primitive::build_extend::(array), DataType::Int32 => primitive::build_extend::(array), DataType::Int64 => primitive::build_extend::(array), + DataType::Int128 => primitive::build_extend::(array), DataType::Float32 => primitive::build_extend::(array), DataType::Float64 => primitive::build_extend::(array), DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { @@ -251,6 +252,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Int16 => primitive::extend_nulls::, DataType::Int32 => primitive::extend_nulls::, DataType::Int64 => primitive::extend_nulls::, + DataType::Int128 => primitive::extend_nulls::, DataType::Float32 => primitive::extend_nulls::, DataType::Float64 => primitive::extend_nulls::, DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { @@ -404,6 +406,7 @@ impl<'a> MutableArrayData<'a> { | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Int128 | DataType::Float16 | DataType::Float32 | DataType::Float64 diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 42ac71fbbd7e..3382ecfaa193 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -260,6 +260,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), + DataType::Int128 => json!({"name": "int", "bitWidth": 128, "isSigned": true}), DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index b290a09acf5d..42067a4ece9b 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -499,7 +499,7 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(children), } } - Int8 | Int16 | Int32 | Int64 => { + Int8 | Int16 | Int32 | Int64 | Int128 => { let children = fbb.create_vector(&empty_fields[..]); let mut builder = crate::IntBuilder::new(fbb); builder.add_is_signed(true); @@ -508,6 +508,7 @@ pub(crate) fn get_fb_field_type<'a>( Int16 => builder.add_bitWidth(16), Int32 => builder.add_bitWidth(32), Int64 => builder.add_bitWidth(64), + Int128 => builder.add_bitWidth(128), _ => {} }; FBFieldType { diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index bfb1f64e2eb8..4b279feedf49 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -23,17 +23,20 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. //! +use std::ops::Not; + use arrow_array::cast::AsArray; -use arrow_array::types::ByteArrayType; +use arrow_array::types::{ + ByteArrayType, Int128Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, +}; use arrow_array::{ - downcast_primitive_array, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, Datum, - FixedSizeBinaryArray, GenericByteArray, + downcast_primitive_array_cmp, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, + Datum, FixedSizeBinaryArray, GenericByteArray, PrimitiveArray, }; use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, IntervalUnit}; use arrow_select::take::take; -use std::ops::Not; #[derive(Debug, Copy, Clone)] enum Op { @@ -206,7 +209,7 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result BooleanBuffer { - let d = downcast_primitive_array! { + let d = downcast_primitive_array_cmp! { (l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v), (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v), (Utf8, Utf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), @@ -214,6 +217,8 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), (LargeBinary, LargeBinary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v), + (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::DayTime)) => apply_interval_dt(op, l, l_s, l_v, r, r_s, r_v), + (Interval(IntervalUnit::MonthDayNano), Interval(IntervalUnit::MonthDayNano)) => apply_interval_mdn(op, l, l_s, l_v, r, r_s, r_v), (Null, Null) => None, _ => unreachable!(), }; @@ -341,6 +346,82 @@ fn apply( } } +fn apply_interval_dt( + op: Op, + l: &dyn Array, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: &dyn Array, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, +) -> Option { + let evaluate_min = apply( + op, + interval_dt_min(l).values().as_ref(), + l_s, + l_v, + interval_dt_min(r).values().as_ref(), + r_s, + r_v, + ); + let evaluate_max = apply( + op, + interval_dt_max(l).values().as_ref(), + l_s, + l_v, + interval_dt_max(r).values().as_ref(), + r_s, + r_v, + ); + definite_comparison(evaluate_min, evaluate_max) +} + +fn apply_interval_mdn( + op: Op, + l: &dyn Array, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: &dyn Array, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, +) -> Option { + let evaluate_min = apply( + op, + interval_mdn_min(l).values().as_ref(), + l_s, + l_v, + interval_mdn_min(r).values().as_ref(), + r_s, + r_v, + ); + let evaluate_max = apply( + op, + interval_mdn_max(l).values().as_ref(), + l_s, + l_v, + interval_mdn_max(r).values().as_ref(), + r_s, + r_v, + ); + definite_comparison(evaluate_min, evaluate_max) +} + +fn definite_comparison( + min: Option, + max: Option, +) -> Option { + min.and_then(|min_values| { + max.map(|max_values| { + BooleanBuffer::from_iter( + min_values + .into_iter() + .zip(&max_values) + .map(|(min, max)| min & max), + ) + }) + }) +} + /// Perform a take operation on `buffer` with the given dictionary fn take_bits(v: &dyn AnyDictionaryArray, buffer: BooleanBuffer) -> BooleanBuffer { let array = take(&BooleanArray::new(buffer, None), v.keys(), None).unwrap(); @@ -540,6 +621,68 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray { } } +#[inline] +fn interval_dt_min(dt: &dyn Array) -> PrimitiveArray { + if let Some(dt) = dt.as_primitive_opt::() { + PrimitiveArray::::from_iter(dt.iter().map(|dt| { + dt.map(|dt| { + let d = dt >> 32; + let m = dt as i32 as i64; + d * (86_400_000) + m + }) + })) + } else { + panic!("Invalid datatype for Interval(IntervalDayTime) comparison") + } +} + +#[inline] +fn interval_dt_max(dt: &dyn Array) -> PrimitiveArray { + if let Some(dt) = dt.as_primitive_opt::() { + PrimitiveArray::::from_iter(dt.iter().map(|dt| { + dt.map(|dt| { + let d = dt >> 32; + let m = dt as i32 as i64; + d * (86_400_000 + 1_000) + m + }) + })) + } else { + panic!("Invalid datatype for Interval(IntervalDayTime) comparison") + } +} + +#[inline] +fn interval_mdn_min(mdn: &dyn Array) -> PrimitiveArray { + if let Some(mdn) = mdn.as_primitive_opt::() { + PrimitiveArray::::from_iter(mdn.iter().map(|mdn| { + mdn.map(|mdn| { + let m = (mdn >> 96) as i32; + let d = (mdn >> 64) as i32; + let n = mdn as i64; + ((m as i128 * 28) + d as i128) * (86_400_000_000_000) + n as i128 + }) + })) + } else { + panic!("Invalid datatype for Interval(IntervalMonthDayNano) comparison") + } +} + +#[inline] +fn interval_mdn_max(mdn: &dyn Array) -> PrimitiveArray { + if let Some(mdn) = mdn.as_primitive_opt::() { + PrimitiveArray::::from_iter(mdn.iter().map(|mdn| { + mdn.map(|mdn| { + let m = (mdn >> 96) as i32; + let d = (mdn >> 64) as i32; + let n = mdn as i64; + ((m as i128 * 31) + d as i128) * (86_400_000_000_000 + 1_000_000_000) + n as i128 + }) + })) + } else { + panic!("Invalid datatype for Interval(IntervalMonthDayNano) comparison") + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 021ecdf0e658..de8b769ea9ca 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -2036,6 +2036,53 @@ mod tests { ); } + #[test] + fn test_interval_array_unit_aware() { + let a = + IntervalDayTimeArray::from(vec![Some(IntervalDayTimeType::make_value(0, -5)),Some(IntervalDayTimeType::make_value(3, -1_000_000)),Some(IntervalDayTimeType::make_value(4, -1000)),Some(IntervalDayTimeType::make_value(10, 20)),Some(IntervalDayTimeType::make_value(1, 2))]); + let b = + IntervalDayTimeArray::from(vec![Some(IntervalDayTimeType::make_value(0, -10)),Some(IntervalDayTimeType::make_value(3, -2_000_000)),Some(IntervalDayTimeType::make_value(2, 1000)),Some(IntervalDayTimeType::make_value(5, 6)),Some(IntervalDayTimeType::make_value(1, 1))]); + let res = gt(&a, &b).unwrap(); + let res_eq = gt_eq(&a, &b).unwrap(); + assert_eq!(res, res_eq); + assert_eq!( + &res, + &BooleanArray::from(vec![ Some(true), Some(true), Some(true), Some(true), Some(true)]) + ); + let res = lt(&b, &a).unwrap(); + let res_eq = lt_eq(&b, &a).unwrap(); + assert_eq!(res, res_eq); + assert_eq!( + &res, + &BooleanArray::from(vec![ Some(true), Some(true), Some(true), Some(true), Some(true)]) + ); + + let a = IntervalMonthDayNanoArray::from( + vec![Some(IntervalMonthDayNanoType::make_value(0, 0, 1)),Some(IntervalMonthDayNanoType::make_value(0, 1, -1_000_000_000)),Some(IntervalMonthDayNanoType::make_value(3, 2, -100_000_000_000)),Some(IntervalMonthDayNanoType::make_value(0, 1, 86_400_000_000_999)),Some(IntervalMonthDayNanoType::make_value(1, 28, 0)), Some(IntervalMonthDayNanoType::make_value(10, 0, -1_000_000_000_000))], + ); + let b = IntervalMonthDayNanoArray::from( + vec![Some(IntervalMonthDayNanoType::make_value(0, 0,0)),Some(IntervalMonthDayNanoType::make_value(0, 1, -8_000_000_000)),Some(IntervalMonthDayNanoType::make_value(1, 25, 100_000_000_000)),Some(IntervalMonthDayNanoType::make_value(0, 2, 0)),Some(IntervalMonthDayNanoType::make_value(2, 0, 0)), Some(IntervalMonthDayNanoType::make_value(5, 150, 1_000_000_000_000))], + ); + let res = gt(&a, &b).unwrap(); + let res_eq = gt_eq(&a, &b).unwrap(); + assert_eq!(res, res_eq); + assert_eq!( + &res, + &BooleanArray::from( + vec![ Some(true), Some(true),Some(true),Some(false),Some(false), Some(false)] + ) + ); + let res = lt(&b, &a).unwrap(); + let res_eq = lt_eq(&b, &a).unwrap(); + assert_eq!(res, res_eq); + assert_eq!( + &res, + &BooleanArray::from( + vec![ Some(true), Some(true),Some(true),Some(false),Some(false), Some(false)] + ) + ); + } + macro_rules! test_binary { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index b78c785ae279..f0c4e62fd753 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -50,6 +50,8 @@ pub enum DataType { Int32, /// A signed 64-bit integer. Int64, + /// A signed 128-bit integer. + Int128, /// An unsigned 8-bit integer. UInt8, /// An unsigned 16-bit integer. @@ -467,6 +469,7 @@ impl DataType { DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2), DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4), DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8), + DataType::Int128 => Some(16), DataType::Timestamp(_, _) => Some(8), DataType::Date32 | DataType::Time32(_) => Some(4), DataType::Date64 | DataType::Time64(_) => Some(8), @@ -500,6 +503,7 @@ impl DataType { | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Int128 | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 574c024bb9b9..0bac28e6c7c0 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -459,6 +459,7 @@ impl Field { | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Int128 | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 4c350c4b1d8c..6de81162f1ef 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -341,6 +341,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .build(), + DataType::Int128 => unimplemented!(), DataType::UInt8 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { bit_width: 8, @@ -1944,4 +1945,4 @@ mod tests { fn test_get_arrow_schema_from_metadata() { assert!(get_arrow_schema_from_metadata("").is_err()); } -} +} \ No newline at end of file