From 634bf8a0cbfbc3ac6bb4d52c3a1f09f88132327d Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Sun, 21 Jan 2024 13:59:27 +1100 Subject: [PATCH 1/5] Refactor temporal extract date part kernels --- arrow-arith/src/temporal.rs | 563 +++++++++++++++++++++++------------- 1 file changed, 365 insertions(+), 198 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index a9c3de5401c1..463a18739e31 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -19,105 +19,352 @@ use std::sync::Arc; -use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; - -use arrow_array::builder::*; -use arrow_array::iterator::ArrayIter; -use arrow_array::temporal_conversions::{as_datetime, as_datetime_with_timezone, as_time}; +use arrow_array::cast::{downcast_array, AsArray}; +use chrono::{DateTime, Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc}; + +use arrow_array::temporal_conversions::{ + date32_to_datetime, date64_to_datetime, time32ms_to_time, time32s_to_time, time64ns_to_time, + time64us_to_time, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, + timestamp_us_to_datetime, +}; use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; -use arrow_schema::{ArrowError, DataType}; - -/// This function takes an `ArrayIter` of input array and an extractor `op` which takes -/// an input `NaiveTime` and returns time component (e.g. hour) as `i32` value. -/// The extracted values are built by the given `builder` to be an `Int32Array`. -fn as_time_with_op, T: ArrowTemporalType, F>( - iter: ArrayIter, - mut builder: PrimitiveBuilder, - op: F, -) -> Int32Array -where - F: Fn(NaiveTime) -> i32, - i64: From, -{ - iter.into_iter().for_each(|value| { - if let Some(value) = value { - match as_time::(i64::from(value)) { - Some(dt) => builder.append_value(op(dt)), - None => builder.append_null(), - } - } else { - builder.append_null(); - } - }); +use arrow_schema::{ArrowError, DataType, TimeUnit}; + +/// Valid parts to extract from date/timestamp arrays. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DatePart { + /// Quarter of the year, in range `1..=4` + Quarter, + /// Calendar year + Year, + /// Month in the year, in range `1..=12` + Month, + /// ISO week of the year, in range `1..=53` + Week, + /// Day of the month, in range `1..=31` + Day, + /// Day of the week, in range `0..=6`, where Sunday is 0 + DayOfWeekSunday0, + /// Day of the week, in range `0..=6`, where Monday is 0 + DayOfWeekMonday0, + /// Day of year, in range `1..=366` + DayOfYear, + /// Hour of the day, in range `0..=23` + Hour, + /// Minute of the hour, in range `0..=59` + Minute, + /// Second of the minute, in range `0..=59` + Second, + /// Millisecond of the second + Millisecond, + /// Microsecond of the second + Microsecond, + /// Nanosecond of the second + Nanosecond, +} - builder.finish() +impl std::fmt::Display for DatePart { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } } -/// This function takes an `ArrayIter` of input array and an extractor `op` which takes -/// an input `NaiveDateTime` and returns data time component (e.g. hour) as `i32` value. -/// The extracted values are built by the given `builder` to be an `Int32Array`. -fn as_datetime_with_op, T: ArrowTemporalType, F>( - iter: ArrayIter, - mut builder: PrimitiveBuilder, - op: F, -) -> Int32Array -where - F: Fn(NaiveDateTime) -> i32, - i64: From, -{ - iter.into_iter().for_each(|value| { - if let Some(value) = value { - match as_datetime::(i64::from(value)) { - Some(dt) => builder.append_value(op(dt)), - None => builder.append_null(), - } - } else { - builder.append_null(); - } - }); +/// Returns function to extract relevant [`DatePart`] from a [`NaiveDateTime`]. +fn get_naive_date_time_part_extract_fn(part: DatePart) -> fn(NaiveDateTime) -> i32 { + match part { + DatePart::Quarter => |d| d.quarter() as i32, + DatePart::Year => |d| d.year(), + DatePart::Month => |d| d.month() as i32, + DatePart::Week => |d| d.iso_week().week() as i32, + DatePart::Day => |d| d.day() as i32, + DatePart::DayOfWeekSunday0 => |d| d.num_days_from_sunday(), + DatePart::DayOfWeekMonday0 => |d| d.num_days_from_monday(), + DatePart::DayOfYear => |d| d.ordinal() as i32, + DatePart::Hour => |d| d.hour() as i32, + DatePart::Minute => |d| d.minute() as i32, + DatePart::Second => |d| d.second() as i32, + DatePart::Millisecond => |d| (d.nanosecond() / 1_000_000) as i32, + DatePart::Microsecond => |d| (d.nanosecond() / 1_000) as i32, + DatePart::Nanosecond => |d| (d.nanosecond()) as i32, + } +} - builder.finish() +/// Returns function to extract relevant [`DatePart`] from a [`DateTime`]. +fn get_date_time_tz_part_extract_fn(part: DatePart) -> fn(DateTime) -> i32 { + match part { + DatePart::Quarter => |d| d.quarter() as i32, + DatePart::Year => |d| d.year(), + DatePart::Month => |d| d.month() as i32, + DatePart::Week => |d| d.iso_week().week() as i32, + DatePart::Day => |d| d.day() as i32, + DatePart::DayOfWeekSunday0 => |d| d.num_days_from_sunday(), + DatePart::DayOfWeekMonday0 => |d| d.num_days_from_monday(), + DatePart::DayOfYear => |d| d.ordinal() as i32, + DatePart::Hour => |d| d.hour() as i32, + DatePart::Minute => |d| d.minute() as i32, + DatePart::Second => |d| d.second() as i32, + DatePart::Millisecond => |d| (d.nanosecond() / 1_000_000) as i32, + DatePart::Microsecond => |d| (d.nanosecond() / 1_000) as i32, + DatePart::Nanosecond => |d| (d.nanosecond()) as i32, + } } -/// This function extracts date time component (e.g. hour) from an array of datatime. -/// `iter` is the `ArrayIter` of input datatime array. `builder` is used to build the -/// returned `Int32Array` containing the extracted components. `tz` is timezone string -/// which will be added to datetime values in the input array. `parsed` is a `Parsed` -/// object used to parse timezone string. `op` is the extractor closure which takes -/// data time object of `NaiveDateTime` type and returns `i32` value of extracted -/// component. -fn extract_component_from_datetime_array< - A: ArrayAccessor, - T: ArrowTemporalType, - F, ->( - iter: ArrayIter, - mut builder: PrimitiveBuilder, - tz: &str, - op: F, -) -> Result -where - F: Fn(DateTime) -> i32, - i64: From, -{ - let tz: Tz = tz.parse()?; - for value in iter { - match value { - Some(value) => match as_datetime_with_timezone::(value.into(), tz) { - Some(time) => builder.append_value(op(time)), - _ => { - return Err(ArrowError::ComputeError( - "Unable to read value as datetime".to_string(), - )) - } - }, - None => builder.append_null(), +fn date_part_time32_s( + array: &PrimitiveArray, + part: DatePart, +) -> Result { + match part { + DatePart::Hour => Ok(array.unary_opt(|d| time32s_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), + } +} + +fn date_part_time32_ms( + array: &PrimitiveArray, + part: DatePart, +) -> Result { + match part { + DatePart::Hour => Ok(array.unary_opt(|d| time32ms_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), + } +} + +fn date_part_time64_us( + array: &PrimitiveArray, + part: DatePart, +) -> Result { + match part { + DatePart::Hour => Ok(array.unary_opt(|d| time64us_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), + } +} + +fn date_part_time64_ns( + array: &PrimitiveArray, + part: DatePart, +) -> Result { + match part { + DatePart::Hour => Ok(array.unary_opt(|d| time64ns_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), + } +} + +fn date_part_date32(array: &PrimitiveArray, part: DatePart) -> Int32Array { + // Date32 only encodes number of days, so these will always be 0 + if let DatePart::Hour + | DatePart::Minute + | DatePart::Second + | DatePart::Millisecond + | DatePart::Microsecond + | DatePart::Nanosecond = part + { + array.unary(|_| 0) + } else { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| date32_to_datetime(d).map(map_func)) + } +} + +fn date_part_date64(array: &PrimitiveArray, part: DatePart) -> Int32Array { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| date64_to_datetime(d).map(map_func)) +} + +fn date_part_timestamp_s( + array: &PrimitiveArray, + part: DatePart, +) -> Int32Array { + // TimestampSecond only encodes number of seconds, so these will always be 0 + if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { + array.unary(|_| 0) + } else { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| timestamp_s_to_datetime(d).map(map_func)) + } +} + +fn date_part_timestamp_s_tz( + array: &PrimitiveArray, + part: DatePart, + tz: Tz, +) -> Int32Array { + // TimestampSecond only encodes number of seconds, so these will always be 0 + if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { + array.unary(|_| 0) + } else { + let map_func = get_date_time_tz_part_extract_fn(part); + array.unary_opt(|d| { + timestamp_s_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) + } +} + +fn date_part_timestamp_ms( + array: &PrimitiveArray, + part: DatePart, +) -> Int32Array { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| timestamp_ms_to_datetime(d).map(map_func)) +} + +fn date_part_timestamp_ms_tz( + array: &PrimitiveArray, + part: DatePart, + tz: Tz, +) -> Int32Array { + let map_func = get_date_time_tz_part_extract_fn(part); + array.unary_opt(|d| { + timestamp_ms_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) +} + +fn date_part_timestamp_us( + array: &PrimitiveArray, + part: DatePart, +) -> Int32Array { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| timestamp_us_to_datetime(d).map(map_func)) +} + +fn date_part_timestamp_us_tz( + array: &PrimitiveArray, + part: DatePart, + tz: Tz, +) -> Int32Array { + let map_func = get_date_time_tz_part_extract_fn(part); + array.unary_opt(|d| { + timestamp_us_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) +} + +fn date_part_timestamp_ns( + array: &PrimitiveArray, + part: DatePart, +) -> Int32Array { + let map_func = get_naive_date_time_part_extract_fn(part); + array.unary_opt(|d| timestamp_ns_to_datetime(d).map(map_func)) +} + +fn date_part_timestamp_ns_tz( + array: &PrimitiveArray, + part: DatePart, + tz: Tz, +) -> Int32Array { + let map_func = get_date_time_tz_part_extract_fn(part); + array.unary_opt(|d| { + timestamp_ns_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) +} + +/// Given array, return new array with the extracted [`DatePart`]. +/// +/// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns +/// the dictionary but with this function applied onto its values. +/// +/// Returns error if attempting to extract date part from unsupported type (i.e. non-date/timestamp types). +pub fn date_part(array: &dyn Array, part: DatePart) -> Result { + downcast_primitive_array!( + array => { + let array = primitive_array_date_part(array, part)?; + let array = Arc::new(array) as ArrayRef; + Ok(array) + } + DataType::Dictionary(_, _) => { + let array = array.as_any_dictionary(); + let values = date_part(array.values(), part)?; + let new_array = array.with_values(Arc::new(values) as ArrayRef); + Ok(new_array) + } + t => return_compute_error_with!(format!("{part} does not support"), t), + ) +} + +/// Dispatch to specialized function depending on the array type. Since we don't need to consider +/// dictionary arrays here, can strictly control return type to be [`Int32Array`]. +fn primitive_array_date_part( + array: &PrimitiveArray, + part: DatePart, +) -> Result { + match array.data_type() { + DataType::Date32 => { + let array = downcast_array::(array); + Ok(date_part_date32(&array, part)) + } + DataType::Date64 => { + let array = downcast_array::(array); + Ok(date_part_date64(&array, part)) + } + DataType::Timestamp(TimeUnit::Second, None) => { + let array = downcast_array::(array); + Ok(date_part_timestamp_s(&array, part)) + } + DataType::Timestamp(TimeUnit::Second, Some(tz)) => { + let array = downcast_array::(array); + let tz = tz.parse()?; + Ok(date_part_timestamp_s_tz(&array, part, tz)) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + let array = downcast_array::(array); + Ok(date_part_timestamp_ms(&array, part)) + } + DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => { + let array = downcast_array::(array); + let tz = tz.parse()?; + Ok(date_part_timestamp_ms_tz(&array, part, tz)) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + let array = downcast_array::(array); + Ok(date_part_timestamp_us(&array, part)) + } + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => { + let array = downcast_array::(array); + let tz = tz.parse()?; + Ok(date_part_timestamp_us_tz(&array, part, tz)) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + let array = downcast_array::(array); + Ok(date_part_timestamp_ns(&array, part)) + } + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => { + let array = downcast_array::(array); + let tz = tz.parse()?; + Ok(date_part_timestamp_ns_tz(&array, part, tz)) + } + DataType::Time32(TimeUnit::Second) => { + let array = downcast_array::(array); + Ok(date_part_time32_s(&array, part)?) + } + DataType::Time32(TimeUnit::Millisecond) => { + let array = downcast_array::(array); + Ok(date_part_time32_ms(&array, part)?) + } + DataType::Time64(TimeUnit::Microsecond) => { + let array = downcast_array::(array); + Ok(date_part_time64_us(&array, part)?) + } + DataType::Time64(TimeUnit::Nanosecond) => { + let array = downcast_array::(array); + Ok(date_part_time64_ns(&array, part)?) } + // TODO: support Interval + // DataType::Interval(_) => todo!(), + _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), } - Ok(builder.finish()) } macro_rules! return_compute_error_with { @@ -170,7 +417,6 @@ pub fn using_chrono_tz_and_utc_naive_date_time( tz: &str, utc: NaiveDateTime, ) -> Option { - use chrono::TimeZone; let tz: Tz = tz.parse().ok()?; Some(tz.offset_from_utc_datetime(&utc).fix()) } @@ -179,7 +425,7 @@ pub fn using_chrono_tz_and_utc_naive_date_time( /// the range of [0, 23]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn hour_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "hour", |t| t.hour() as i32) + date_part(array, DatePart::Hour) } /// Extracts the hours of a given temporal primitive array as an array of integers within @@ -189,37 +435,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Time32(_) | DataType::Time64(_) => { - let iter = ArrayIter::new(array); - Ok(as_time_with_op::<&PrimitiveArray, T, _>(iter, b, |t| { - t.hour() as i32 - })) - } - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( - iter, - b, - |t| t.hour() as i32, - )) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<&PrimitiveArray, T, _>(iter, b, tz, |t| { - t.hour() as i32 - }) - } - _ => return_compute_error_with!("hour does not support", array.data_type()), - } + primitive_array_date_part(array, DatePart::Hour) } /// Extracts the years of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn year_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "year", |t| t.year()) + date_part(array, DatePart::Year) } /// Extracts the years of a given temporal primitive array as an array of integers @@ -228,14 +451,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "year", |t| t.year()) + primitive_array_date_part(array, DatePart::Year) } /// Extracts the quarter of a given temporal array as an array of integersa within /// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn quarter_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "quarter", |t| t.quarter() as i32) + date_part(array, DatePart::Quarter) } /// Extracts the quarter of a given temporal primitive array as an array of integers within @@ -245,14 +468,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "quarter", |t| t.quarter() as i32) + primitive_array_date_part(array, DatePart::Quarter) } /// Extracts the month of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn month_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "month", |t| t.month() as i32) + date_part(array, DatePart::Month) } /// Extracts the month of a given temporal primitive array as an array of integers within @@ -262,7 +485,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "month", |t| t.month() as i32) + primitive_array_date_part(array, DatePart::Month) } /// Extracts the day of week of a given temporal array as an array of @@ -275,7 +498,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "num_days_from_monday", |t| t.num_days_from_monday()) + date_part(array, DatePart::DayOfWeekMonday0) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -289,7 +512,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "num_days_from_monday", |t| t.num_days_from_monday()) + primitive_array_date_part(array, DatePart::DayOfWeekMonday0) } /// Extracts the day of week of a given temporal array as an array of @@ -302,7 +525,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "num_days_from_sunday", |t| t.num_days_from_sunday()) + date_part(array, DatePart::DayOfWeekSunday0) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -316,14 +539,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "num_days_from_sunday", |t| t.num_days_from_sunday()) + primitive_array_date_part(array, DatePart::DayOfWeekSunday0) } /// Extracts the day of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn day_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "day", |t| t.day() as i32) + date_part(array, DatePart::Day) } /// Extracts the day of a given temporal primitive array as an array of integers @@ -332,7 +555,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "day", |t| t.day() as i32) + primitive_array_date_part(array, DatePart::Day) } /// Extracts the day of year of a given temporal array as an array of integers @@ -340,7 +563,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn doy_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "doy", |t| t.ordinal() as i32) + date_part(array, DatePart::DayOfYear) } /// Extracts the day of year of a given temporal primitive array as an array of integers @@ -351,7 +574,7 @@ where T::Native: ArrowNativeType, i64: From, { - time_fraction_internal(array, "doy", |t| t.ordinal() as i32) + primitive_array_date_part(array, DatePart::DayOfYear) } /// Extracts the minutes of a given temporal primitive array as an array of integers @@ -360,14 +583,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "minute", |t| t.minute() as i32) + primitive_array_date_part(array, DatePart::Minute) } /// Extracts the week of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn week_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "week", |t| t.iso_week().week() as i32) + date_part(array, DatePart::Week) } /// Extracts the week of a given temporal primitive array as an array of integers @@ -376,7 +599,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "week", |t| t.iso_week().week() as i32) + primitive_array_date_part(array, DatePart::Week) } /// Extracts the seconds of a given temporal primitive array as an array of integers @@ -385,7 +608,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "second", |t| t.second() as i32) + primitive_array_date_part(array, DatePart::Second) } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers @@ -394,14 +617,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "nanosecond", |t| t.nanosecond() as i32) + primitive_array_date_part(array, DatePart::Nanosecond) } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn nanosecond_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "nanosecond", |t| t.nanosecond() as i32) + date_part(array, DatePart::Nanosecond) } /// Extracts the microseconds of a given temporal primitive array as an array of integers @@ -410,14 +633,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "microsecond", |t| (t.nanosecond() / 1_000) as i32) + primitive_array_date_part(array, DatePart::Microsecond) } /// Extracts the microseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn microsecond_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "microsecond", |t| (t.nanosecond() / 1_000) as i32) + date_part(array, DatePart::Microsecond) } /// Extracts the milliseconds of a given temporal primitive array as an array of integers @@ -426,83 +649,27 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "millisecond", |t| { - (t.nanosecond() / 1_000_000) as i32 - }) + primitive_array_date_part(array, DatePart::Millisecond) } /// Extracts the milliseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn millisecond_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "millisecond", |t| { - (t.nanosecond() / 1_000_000) as i32 - }) -} - -/// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_dyn(array: &dyn Array, name: &str, op: F) -> Result -where - F: Fn(NaiveDateTime) -> i32, -{ - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = time_fraction_dyn(array.values(), name, op)?; - Ok(Arc::new(array.with_values(values))) - } - dt => return_compute_error_with!(format!("{name} does not support"), dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - time_fraction_internal(array, name, op) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!(format!("{name} does not support"), dt), - ) - } - } -} - -/// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_internal( - array: &PrimitiveArray, - name: &str, - op: F, -) -> Result -where - F: Fn(NaiveDateTime) -> i32, - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, op)) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| op(t.naive_local())) - } - _ => return_compute_error_with!(format!("{name} does not support"), array.data_type()), - } + date_part(array, DatePart::Millisecond) } /// Extracts the minutes of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn minute_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "minute", |t| t.minute() as i32) + date_part(array, DatePart::Minute) } /// Extracts the seconds of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn second_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "second", |t| t.second() as i32) + date_part(array, DatePart::Second) } #[cfg(test)] @@ -932,7 +1099,7 @@ mod tests { let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); - let b = time_fraction_dyn(&dict, "minute", |t| t.minute() as i32).unwrap(); + let b = date_part(&dict, DatePart::Minute).unwrap(); let b_old = minute_dyn(&dict).unwrap(); @@ -942,7 +1109,7 @@ mod tests { assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); - let b = time_fraction_dyn(&dict, "second", |t| t.second() as i32).unwrap(); + let b = date_part(&dict, DatePart::Second).unwrap(); let b_old = second_dyn(&dict).unwrap(); @@ -952,7 +1119,7 @@ mod tests { assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); - let b = time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); + let b = date_part(&dict, DatePart::Nanosecond).unwrap(); let expected_dict = DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![0, 0, 0, 0, 0]))); From 0f6cf53df1993dec7d0bd86888e9763e517cdcca Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Tue, 23 Jan 2024 18:16:41 +1100 Subject: [PATCH 2/5] Use generics to simplify code --- arrow-arith/src/temporal.rs | 437 +++++++++++++++--------------------- 1 file changed, 179 insertions(+), 258 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 463a18739e31..d04d6878945d 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -19,8 +19,8 @@ use std::sync::Arc; -use arrow_array::cast::{downcast_array, AsArray}; -use chrono::{DateTime, Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc}; +use arrow_array::cast::AsArray; +use chrono::{Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc}; use arrow_array::temporal_conversions::{ date32_to_datetime, date64_to_datetime, time32ms_to_time, time32s_to_time, time64ns_to_time, @@ -31,7 +31,7 @@ use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; -use arrow_schema::{ArrowError, DataType, TimeUnit}; +use arrow_schema::{ArrowError, DataType}; /// Valid parts to extract from date/timestamp arrays. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -72,8 +72,14 @@ impl std::fmt::Display for DatePart { } } -/// Returns function to extract relevant [`DatePart`] from a [`NaiveDateTime`]. -fn get_naive_date_time_part_extract_fn(part: DatePart) -> fn(NaiveDateTime) -> i32 { +/// Returns function to extract relevant [`DatePart`] from types like a +/// [`NaiveDateTime`] or [`DateTime`]. +/// +/// [`DateTime`]: chrono::DateTime +fn get_date_time_part_extract_fn(part: DatePart) -> fn(T) -> i32 +where + T: ChronoDateExt + Datelike + Timelike, +{ match part { DatePart::Quarter => |d| d.quarter() as i32, DatePart::Year => |d| d.year(), @@ -92,278 +98,193 @@ fn get_naive_date_time_part_extract_fn(part: DatePart) -> fn(NaiveDateTime) -> i } } -/// Returns function to extract relevant [`DatePart`] from a [`DateTime`]. -fn get_date_time_tz_part_extract_fn(part: DatePart) -> fn(DateTime) -> i32 { - match part { - DatePart::Quarter => |d| d.quarter() as i32, - DatePart::Year => |d| d.year(), - DatePart::Month => |d| d.month() as i32, - DatePart::Week => |d| d.iso_week().week() as i32, - DatePart::Day => |d| d.day() as i32, - DatePart::DayOfWeekSunday0 => |d| d.num_days_from_sunday(), - DatePart::DayOfWeekMonday0 => |d| d.num_days_from_monday(), - DatePart::DayOfYear => |d| d.ordinal() as i32, - DatePart::Hour => |d| d.hour() as i32, - DatePart::Minute => |d| d.minute() as i32, - DatePart::Second => |d| d.second() as i32, - DatePart::Millisecond => |d| (d.nanosecond() / 1_000_000) as i32, - DatePart::Microsecond => |d| (d.nanosecond() / 1_000) as i32, - DatePart::Nanosecond => |d| (d.nanosecond()) as i32, - } +/// Given array, return new array with the extracted [`DatePart`]. +/// +/// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns +/// the dictionary but with this function applied onto its values. +/// +/// Returns error if attempting to extract date part from unsupported type (i.e. non-date/timestamp types). +pub fn date_part(array: &dyn Array, part: DatePart) -> Result { + downcast_temporal_array!( + array => { + let array = array.date_part(part)?; + let array = Arc::new(array) as ArrayRef; + Ok(array) + } + // TODO: support interval + // DataType::Interval(_) => { + // todo!(); + // } + DataType::Dictionary(_, _) => { + let array = array.as_any_dictionary(); + let values = date_part(array.values(), part)?; + let values = Arc::new(values) as ArrayRef; + let new_array = array.with_values(values); + Ok(new_array) + } + t => return_compute_error_with!(format!("{part} does not support"), t), + ) } -fn date_part_time32_s( - array: &PrimitiveArray, +/// Used to integrate new [`date_part()`] method with existing shims such as +/// [`hour()`] and [`week()`]. +fn date_part_primitive( + array: &PrimitiveArray, part: DatePart, ) -> Result { - match part { - DatePart::Hour => Ok(array.unary_opt(|d| time32s_to_time(d).map(|c| c.hour() as i32))), - // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 - _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), - } + let array = date_part(array, part)?; + Ok(array.as_primitive::().to_owned()) } -fn date_part_time32_ms( - array: &PrimitiveArray, - part: DatePart, -) -> Result { - match part { - DatePart::Hour => Ok(array.unary_opt(|d| time32ms_to_time(d).map(|c| c.hour() as i32))), - // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 - _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), +/// Extract optional [`Tz`] from timestamp data types, returning error +/// if called with a non-timestamp type. +fn get_tz(dt: &DataType) -> Result, ArrowError> { + match dt { + DataType::Timestamp(_, Some(tz)) => Ok(Some(tz.parse::()?)), + DataType::Timestamp(_, None) => Ok(None), + _ => Err(ArrowError::CastError(format!("Not a timestamp type: {dt}"))), } } -fn date_part_time64_us( - array: &PrimitiveArray, - part: DatePart, -) -> Result { - match part { - DatePart::Hour => Ok(array.unary_opt(|d| time64us_to_time(d).map(|c| c.hour() as i32))), - // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 - _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), - } +/// Implement the specialized functions for extracting date part from temporal arrays. +trait ExtractDatePartExt { + fn date_part(&self, part: DatePart) -> Result; } -fn date_part_time64_ns( - array: &PrimitiveArray, - part: DatePart, -) -> Result { - match part { - DatePart::Hour => Ok(array.unary_opt(|d| time64ns_to_time(d).map(|c| c.hour() as i32))), - // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 - _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Hour => Ok(self.unary_opt(|d| time32s_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), self.data_type()), + } } } -fn date_part_date32(array: &PrimitiveArray, part: DatePart) -> Int32Array { - // Date32 only encodes number of days, so these will always be 0 - if let DatePart::Hour - | DatePart::Minute - | DatePart::Second - | DatePart::Millisecond - | DatePart::Microsecond - | DatePart::Nanosecond = part - { - array.unary(|_| 0) - } else { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| date32_to_datetime(d).map(map_func)) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Hour => Ok(self.unary_opt(|d| time32ms_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), self.data_type()), + } } } -fn date_part_date64(array: &PrimitiveArray, part: DatePart) -> Int32Array { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| date64_to_datetime(d).map(map_func)) -} - -fn date_part_timestamp_s( - array: &PrimitiveArray, - part: DatePart, -) -> Int32Array { - // TimestampSecond only encodes number of seconds, so these will always be 0 - if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { - array.unary(|_| 0) - } else { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| timestamp_s_to_datetime(d).map(map_func)) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Hour => Ok(self.unary_opt(|d| time64us_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), self.data_type()), + } } } -fn date_part_timestamp_s_tz( - array: &PrimitiveArray, - part: DatePart, - tz: Tz, -) -> Int32Array { - // TimestampSecond only encodes number of seconds, so these will always be 0 - if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { - array.unary(|_| 0) - } else { - let map_func = get_date_time_tz_part_extract_fn(part); - array.unary_opt(|d| { - timestamp_s_to_datetime(d) - .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) - .map(map_func) - }) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Hour => Ok(self.unary_opt(|d| time64ns_to_time(d).map(|c| c.hour() as i32))), + // TODO expand support for Time types, see: https://github.com/apache/arrow-rs/issues/5261 + _ => return_compute_error_with!(format!("{part} does not support"), self.data_type()), + } } } -fn date_part_timestamp_ms( - array: &PrimitiveArray, - part: DatePart, -) -> Int32Array { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| timestamp_ms_to_datetime(d).map(map_func)) -} - -fn date_part_timestamp_ms_tz( - array: &PrimitiveArray, - part: DatePart, - tz: Tz, -) -> Int32Array { - let map_func = get_date_time_tz_part_extract_fn(part); - array.unary_opt(|d| { - timestamp_ms_to_datetime(d) - .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) - .map(map_func) - }) -} - -fn date_part_timestamp_us( - array: &PrimitiveArray, - part: DatePart, -) -> Int32Array { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| timestamp_us_to_datetime(d).map(map_func)) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + // Date32 only encodes number of days, so these will always be 0 + if let DatePart::Hour + | DatePart::Minute + | DatePart::Second + | DatePart::Millisecond + | DatePart::Microsecond + | DatePart::Nanosecond = part + { + Ok(self.unary(|_| 0)) + } else { + let map_func = get_date_time_part_extract_fn(part); + Ok(self.unary_opt(|d| date32_to_datetime(d).map(map_func))) + } + } } -fn date_part_timestamp_us_tz( - array: &PrimitiveArray, - part: DatePart, - tz: Tz, -) -> Int32Array { - let map_func = get_date_time_tz_part_extract_fn(part); - array.unary_opt(|d| { - timestamp_us_to_datetime(d) - .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) - .map(map_func) - }) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + let map_func = get_date_time_part_extract_fn(part); + Ok(self.unary_opt(|d| date64_to_datetime(d).map(map_func))) + } } -fn date_part_timestamp_ns( - array: &PrimitiveArray, - part: DatePart, -) -> Int32Array { - let map_func = get_naive_date_time_part_extract_fn(part); - array.unary_opt(|d| timestamp_ns_to_datetime(d).map(map_func)) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + // TimestampSecond only encodes number of seconds, so these will always be 0 + let array = + if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { + self.unary(|_| 0) + } else if let Some(tz) = get_tz(self.data_type())? { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| { + timestamp_s_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) + } else { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| timestamp_s_to_datetime(d).map(map_func)) + }; + Ok(array) + } } -fn date_part_timestamp_ns_tz( - array: &PrimitiveArray, - part: DatePart, - tz: Tz, -) -> Int32Array { - let map_func = get_date_time_tz_part_extract_fn(part); - array.unary_opt(|d| { - timestamp_ns_to_datetime(d) - .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) - .map(map_func) - }) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + let array = if let Some(tz) = get_tz(self.data_type())? { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| { + timestamp_ms_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) + } else { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| timestamp_ms_to_datetime(d).map(map_func)) + }; + Ok(array) + } } -/// Given array, return new array with the extracted [`DatePart`]. -/// -/// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns -/// the dictionary but with this function applied onto its values. -/// -/// Returns error if attempting to extract date part from unsupported type (i.e. non-date/timestamp types). -pub fn date_part(array: &dyn Array, part: DatePart) -> Result { - downcast_primitive_array!( - array => { - let array = primitive_array_date_part(array, part)?; - let array = Arc::new(array) as ArrayRef; - Ok(array) - } - DataType::Dictionary(_, _) => { - let array = array.as_any_dictionary(); - let values = date_part(array.values(), part)?; - let new_array = array.with_values(Arc::new(values) as ArrayRef); - Ok(new_array) - } - t => return_compute_error_with!(format!("{part} does not support"), t), - ) +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + let array = if let Some(tz) = get_tz(self.data_type())? { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| { + timestamp_us_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) + } else { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| timestamp_us_to_datetime(d).map(map_func)) + }; + Ok(array) + } } -/// Dispatch to specialized function depending on the array type. Since we don't need to consider -/// dictionary arrays here, can strictly control return type to be [`Int32Array`]. -fn primitive_array_date_part( - array: &PrimitiveArray, - part: DatePart, -) -> Result { - match array.data_type() { - DataType::Date32 => { - let array = downcast_array::(array); - Ok(date_part_date32(&array, part)) - } - DataType::Date64 => { - let array = downcast_array::(array); - Ok(date_part_date64(&array, part)) - } - DataType::Timestamp(TimeUnit::Second, None) => { - let array = downcast_array::(array); - Ok(date_part_timestamp_s(&array, part)) - } - DataType::Timestamp(TimeUnit::Second, Some(tz)) => { - let array = downcast_array::(array); - let tz = tz.parse()?; - Ok(date_part_timestamp_s_tz(&array, part, tz)) - } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = downcast_array::(array); - Ok(date_part_timestamp_ms(&array, part)) - } - DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => { - let array = downcast_array::(array); - let tz = tz.parse()?; - Ok(date_part_timestamp_ms_tz(&array, part, tz)) - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { - let array = downcast_array::(array); - Ok(date_part_timestamp_us(&array, part)) - } - DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => { - let array = downcast_array::(array); - let tz = tz.parse()?; - Ok(date_part_timestamp_us_tz(&array, part, tz)) - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = downcast_array::(array); - Ok(date_part_timestamp_ns(&array, part)) - } - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => { - let array = downcast_array::(array); - let tz = tz.parse()?; - Ok(date_part_timestamp_ns_tz(&array, part, tz)) - } - DataType::Time32(TimeUnit::Second) => { - let array = downcast_array::(array); - Ok(date_part_time32_s(&array, part)?) - } - DataType::Time32(TimeUnit::Millisecond) => { - let array = downcast_array::(array); - Ok(date_part_time32_ms(&array, part)?) - } - DataType::Time64(TimeUnit::Microsecond) => { - let array = downcast_array::(array); - Ok(date_part_time64_us(&array, part)?) - } - DataType::Time64(TimeUnit::Nanosecond) => { - let array = downcast_array::(array); - Ok(date_part_time64_ns(&array, part)?) - } - // TODO: support Interval - // DataType::Interval(_) => todo!(), - _ => return_compute_error_with!(format!("{part} does not support"), array.data_type()), +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + let array = if let Some(tz) = get_tz(self.data_type())? { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| { + timestamp_ns_to_datetime(d) + .map(|c| Utc.from_utc_datetime(&c).with_timezone(&tz)) + .map(map_func) + }) + } else { + let map_func = get_date_time_part_extract_fn(part); + self.unary_opt(|d| timestamp_ns_to_datetime(d).map(map_func)) + }; + Ok(array) } } @@ -435,7 +356,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Hour) + date_part_primitive(array, DatePart::Hour) } /// Extracts the years of a given temporal array as an array of integers. @@ -451,7 +372,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Year) + date_part_primitive(array, DatePart::Year) } /// Extracts the quarter of a given temporal array as an array of integersa within @@ -468,7 +389,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Quarter) + date_part_primitive(array, DatePart::Quarter) } /// Extracts the month of a given temporal array as an array of integers. @@ -485,7 +406,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Month) + date_part_primitive(array, DatePart::Month) } /// Extracts the day of week of a given temporal array as an array of @@ -512,7 +433,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::DayOfWeekMonday0) + date_part_primitive(array, DatePart::DayOfWeekMonday0) } /// Extracts the day of week of a given temporal array as an array of @@ -539,7 +460,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::DayOfWeekSunday0) + date_part_primitive(array, DatePart::DayOfWeekSunday0) } /// Extracts the day of a given temporal array as an array of integers. @@ -555,7 +476,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Day) + date_part_primitive(array, DatePart::Day) } /// Extracts the day of year of a given temporal array as an array of integers @@ -574,7 +495,7 @@ where T::Native: ArrowNativeType, i64: From, { - primitive_array_date_part(array, DatePart::DayOfYear) + date_part_primitive(array, DatePart::DayOfYear) } /// Extracts the minutes of a given temporal primitive array as an array of integers @@ -583,7 +504,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Minute) + date_part_primitive(array, DatePart::Minute) } /// Extracts the week of a given temporal array as an array of integers. @@ -599,7 +520,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Week) + date_part_primitive(array, DatePart::Week) } /// Extracts the seconds of a given temporal primitive array as an array of integers @@ -608,7 +529,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Second) + date_part_primitive(array, DatePart::Second) } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers @@ -617,7 +538,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Nanosecond) + date_part_primitive(array, DatePart::Nanosecond) } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers. @@ -633,7 +554,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Microsecond) + date_part_primitive(array, DatePart::Microsecond) } /// Extracts the microseconds of a given temporal primitive array as an array of integers. @@ -649,7 +570,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - primitive_array_date_part(array, DatePart::Millisecond) + date_part_primitive(array, DatePart::Millisecond) } /// Extracts the milliseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, From 87dfcea1daaec029e1f95afb6e3f88eef48f16f4 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Wed, 24 Jan 2024 07:05:24 +1100 Subject: [PATCH 3/5] Refactor how zeroed arrays are created --- arrow-arith/src/temporal.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index d04d6878945d..d0a705d160e8 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -201,7 +201,10 @@ impl ExtractDatePartExt for PrimitiveArray { | DatePart::Microsecond | DatePart::Nanosecond = part { - Ok(self.unary(|_| 0)) + Ok(Int32Array::new( + vec![0; self.len()].into(), + self.nulls().cloned(), + )) } else { let map_func = get_date_time_part_extract_fn(part); Ok(self.unary_opt(|d| date32_to_datetime(d).map(map_func))) @@ -221,7 +224,7 @@ impl ExtractDatePartExt for PrimitiveArray { // TimestampSecond only encodes number of seconds, so these will always be 0 let array = if let DatePart::Millisecond | DatePart::Microsecond | DatePart::Nanosecond = part { - self.unary(|_| 0) + Int32Array::new(vec![0; self.len()].into(), self.nulls().cloned()) } else if let Some(tz) = get_tz(self.data_type())? { let map_func = get_date_time_part_extract_fn(part); self.unary_opt(|d| { From 81f39a8ea76caa7ac8144dc818e0be9ca4e37de8 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Wed, 24 Jan 2024 19:49:30 +1100 Subject: [PATCH 4/5] Update docs and add deprecated labels --- arrow-arith/src/temporal.rs | 68 +++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index d0a705d160e8..a386559e30ba 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -33,8 +33,14 @@ use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; -/// Valid parts to extract from date/timestamp arrays. +/// Valid parts to extract from date/time/timestamp arrays. +/// +/// See [`date_part`]. +/// +/// Marked as non-exhaustive as may expand to support more types of +/// date parts in the future. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] pub enum DatePart { /// Quarter of the year, in range `1..=4` Quarter, @@ -46,9 +52,9 @@ pub enum DatePart { Week, /// Day of the month, in range `1..=31` Day, - /// Day of the week, in range `0..=6`, where Sunday is 0 + /// Day of the week, in range `0..=6`, where Sunday is `0` DayOfWeekSunday0, - /// Day of the week, in range `0..=6`, where Monday is 0 + /// Day of the week, in range `0..=6`, where Monday is `0` DayOfWeekMonday0, /// Day of year, in range `1..=366` DayOfYear, @@ -98,12 +104,32 @@ where } } -/// Given array, return new array with the extracted [`DatePart`]. +/// Given an array, return a new array with the extracted [`DatePart`] as signed 32-bit +/// integer values. +/// +/// Currently only supports temporal types: +/// - Date32/Date64 +/// - Time32/Time64 (Limited support) +/// - Timestamp /// /// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns /// the dictionary but with this function applied onto its values. /// -/// Returns error if attempting to extract date part from unsupported type (i.e. non-date/timestamp types). +/// If array passed in is not of the above listed types (or is a dictionary array where the +/// values array isn't of the above listed types), then this function will return an error. +/// +/// # Examples +/// +/// ``` +/// # use arrow_array::{Int32Array, TimestampMicrosecondArray}; +/// # use arrow_arith::temporal::{DatePart, date_part}; +/// let input: TimestampMicrosecondArray = +/// vec![Some(1612025847000000), None, Some(1722015847000000)].into(); +/// +/// let actual = date_part(&input, DatePart::Week).unwrap(); +/// let expected: Int32Array = vec![Some(4), None, Some(30)].into(); +/// assert_eq!(actual.as_ref(), &expected); +/// ``` pub fn date_part(array: &dyn Array, part: DatePart) -> Result { downcast_temporal_array!( array => { @@ -126,7 +152,7 @@ pub fn date_part(array: &dyn Array, part: DatePart) -> Result( array: &PrimitiveArray, @@ -348,12 +374,14 @@ pub fn using_chrono_tz_and_utc_naive_date_time( /// Extracts the hours of a given array as an array of integers within /// the range of [0, 23]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn hour_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Hour) } /// Extracts the hours of a given temporal primitive array as an array of integers within /// the range of [0, 23]. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn hour(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -365,11 +393,13 @@ where /// Extracts the years of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn year_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Year) } /// Extracts the years of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn year(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -381,12 +411,14 @@ where /// Extracts the quarter of a given temporal array as an array of integersa within /// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn quarter_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Quarter) } /// Extracts the quarter of a given temporal primitive array as an array of integers within /// the range of [1, 4]. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn quarter(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -398,12 +430,14 @@ where /// Extracts the month of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn month_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Month) } /// Extracts the month of a given temporal primitive array as an array of integers within /// the range of [1, 12]. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn month(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -421,6 +455,7 @@ where /// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::DayOfWeekMonday0) } @@ -431,6 +466,7 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -448,6 +484,7 @@ where /// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::DayOfWeekSunday0) } @@ -458,6 +495,7 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -469,11 +507,13 @@ where /// Extracts the day of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn day_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Day) } /// Extracts the day of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn day(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -486,12 +526,14 @@ where /// The day of year that ranges from 1 to 366. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn doy_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::DayOfYear) } /// Extracts the day of year of a given temporal primitive array as an array of integers /// The day of year that ranges from 1 to 366 +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn doy(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -502,6 +544,7 @@ where } /// Extracts the minutes of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn minute(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -513,11 +556,13 @@ where /// Extracts the week of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn week_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Week) } /// Extracts the week of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn week(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -527,6 +572,7 @@ where } /// Extracts the seconds of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn second(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -536,6 +582,7 @@ where } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn nanosecond(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -547,11 +594,13 @@ where /// Extracts the nanoseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn nanosecond_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Nanosecond) } /// Extracts the microseconds of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn microsecond(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -563,11 +612,13 @@ where /// Extracts the microseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn microsecond_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Microsecond) } /// Extracts the milliseconds of a given temporal primitive array as an array of integers +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn millisecond(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, @@ -575,9 +626,11 @@ where { date_part_primitive(array, DatePart::Millisecond) } + /// Extracts the milliseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn millisecond_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Millisecond) } @@ -585,6 +638,7 @@ pub fn millisecond_dyn(array: &dyn Array) -> Result { /// Extracts the minutes of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn minute_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Minute) } @@ -592,11 +646,13 @@ pub fn minute_dyn(array: &dyn Array) -> Result { /// Extracts the seconds of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] pub fn second_dyn(array: &dyn Array) -> Result { date_part(array, DatePart::Second) } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; From 39a69ce571b82db66024855706b10a6d58c6efd5 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Thu, 25 Jan 2024 06:15:27 +1100 Subject: [PATCH 5/5] Fix clippy --- arrow/tests/arithmetic.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow/tests/arithmetic.rs b/arrow/tests/arithmetic.rs index 81a19d4b5e20..59a162ef6dc0 100644 --- a/arrow/tests/arithmetic.rs +++ b/arrow/tests/arithmetic.rs @@ -16,7 +16,7 @@ // under the License. use arrow_arith::numeric::{add, sub}; -use arrow_arith::temporal::hour; +use arrow_arith::temporal::{date_part, DatePart}; use arrow_array::cast::AsArray; use arrow_array::temporal_conversions::as_datetime_with_timezone; use arrow_array::timezone::Tz; @@ -28,7 +28,8 @@ use chrono::{DateTime, TimeZone}; fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("Asia/Kolkata".to_string()); - let b = hour(&a).unwrap(); + let b = date_part(&a, DatePart::Hour).unwrap(); + let b = b.as_primitive::(); assert_eq!(15, b.value(0)); } @@ -41,7 +42,8 @@ fn test_temporal_array_timestamp_hour_with_dst_timezone_using_chrono_tz() { let a = TimestampMillisecondArray::from(vec![Some(1635577147000)]) .with_timezone("Australia/Sydney".to_string()); - let b = hour(&a).unwrap(); + let b = date_part(&a, DatePart::Hour).unwrap(); + let b = b.as_primitive::(); assert_eq!(17, b.value(0)); }