From 9ec60f307fbb6c0630cfe4fc8a0924c5ae077a51 Mon Sep 17 00:00:00 2001 From: Clide Stefani <109172241+Monkwire3@users.noreply.github.com> Date: Thu, 28 Mar 2024 09:26:13 -0400 Subject: [PATCH 1/4] Spit cast::string into a submodule of cast --- arrow-cast/src/cast/mod.rs | 219 +-------------------------- arrow-cast/src/cast/string.rs | 274 ++++++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+), 217 deletions(-) create mode 100644 arrow-cast/src/cast/string.rs diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 52eb0d367271..446a370e61f5 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -40,9 +40,11 @@ mod decimal; mod dictionary; mod list; +mod string; use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; +use crate::cast::string::*; use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; @@ -2001,26 +2003,6 @@ where from.unary_opt::<_, R>(num::cast::cast::) } -fn value_to_string( - array: &dyn Array, - options: &CastOptions, -) -> Result { - let mut builder = GenericStringBuilder::::new(); - let formatter = ArrayFormatter::try_new(array, &options.format_options)?; - let nulls = array.nulls(); - for i in 0..array.len() { - match nulls.map(|x| x.is_null(i)).unwrap_or_default() { - true => builder.append_null(), - false => { - formatter.value(i).write(&mut builder)?; - // tell the builder the row is finished - builder.append_value(""); - } - } - } - Ok(Arc::new(builder.finish())) -} - fn cast_numeric_to_binary( array: &dyn Array, ) -> Result { @@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary( ))) } -/// Parse UTF-8 -fn parse_string( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let string_array = array.as_string::(); - let array = if cast_options.safe { - let iter = string_array.iter().map(|x| x.and_then(P::parse)); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::

::from_trusted_len_iter(iter) } - } else { - let v = string_array - .iter() - .map(|x| match x { - Some(v) => P::parse(v).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - P::DATA_TYPE - )) - }), - None => Ok(P::Native::default()), - }) - .collect::, ArrowError>>()?; - PrimitiveArray::new(v.into(), string_array.nulls().cloned()) - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) -fn cast_string_to_timestamp( - array: &dyn Array, - to_tz: &Option>, - cast_options: &CastOptions, -) -> Result { - let array = array.as_string::(); - let out: PrimitiveArray = match to_tz { - Some(tz) => { - let tz: Tz = tz.as_ref().parse()?; - cast_string_to_timestamp_impl(array, &tz, cast_options)? - } - None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, - }; - Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) -} - -fn cast_string_to_timestamp_impl( - array: &GenericStringArray, - tz: &Tz, - cast_options: &CastOptions, -) -> Result, ArrowError> { - if cast_options.safe { - let iter = array.iter().map(|v| { - v.and_then(|v| { - let naive = string_to_datetime(tz, v).ok()?.naive_utc(); - T::make_value(naive) - }) - }); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - - Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) - } else { - let vec = array - .iter() - .map(|v| { - v.map(|v| { - let naive = string_to_datetime(tz, v)?.naive_utc(); - T::make_value(naive).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow converting {naive} to {:?}", - T::UNIT - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) - } -} - -fn cast_string_to_interval( - array: &dyn Array, - cast_options: &CastOptions, - parse_function: F, -) -> Result -where - Offset: OffsetSizeTrait, - ArrowType: ArrowPrimitiveType, - F: Fn(&str) -> Result + Copy, -{ - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let interval_array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| parse_function(v).ok())); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| v.map(parse_function).transpose()) - .collect::, ArrowError>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } - }; - Ok(Arc::new(interval_array) as ArrayRef) -} - -fn cast_string_to_year_month_interval( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - cast_string_to_interval::( - array, - cast_options, - parse_interval_year_month, - ) -} - -fn cast_string_to_day_time_interval( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - cast_string_to_interval::( - array, - cast_options, - parse_interval_day_time, - ) -} - -fn cast_string_to_month_day_nano_interval( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - cast_string_to_interval::( - array, - cast_options, - parse_interval_month_day_nano, - ) -} - fn adjust_timestamp_to_timezone( array: PrimitiveArray, to_tz: &Tz, @@ -2325,37 +2141,6 @@ where unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } -/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same -/// offset size so re-encoding offset is unnecessary. -fn cast_binary_to_string( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let array = array - .as_any() - .downcast_ref::>>() - .unwrap(); - - match GenericStringArray::::try_from_binary(array.clone()) { - Ok(a) => Ok(Arc::new(a)), - Err(e) => match cast_options.safe { - true => { - // Fallback to slow method to convert invalid sequences to nulls - let mut builder = - GenericStringBuilder::::with_capacity(array.len(), array.value_data().len()); - - let iter = array - .iter() - .map(|v| v.and_then(|v| std::str::from_utf8(v).ok())); - - builder.extend(iter); - Ok(Arc::new(builder.finish())) - } - false => Err(e), - }, - } -} - /// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to 'FixedSizeBinaryArray'. fn cast_binary_to_fixed_size_binary( array: &dyn Array, diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs new file mode 100644 index 000000000000..e7d7bfe1b3d5 --- /dev/null +++ b/arrow-cast/src/cast/string.rs @@ -0,0 +1,274 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::cast::*; + +pub(crate) fn value_to_string( + array: &dyn Array, + options: &CastOptions, +) -> Result { + let mut builder = GenericStringBuilder::::new(); + let formatter = ArrayFormatter::try_new(array, &options.format_options)?; + let nulls = array.nulls(); + for i in 0..array.len() { + match nulls.map(|x| x.is_null(i)).unwrap_or_default() { + true => builder.append_null(), + false => { + formatter.value(i).write(&mut builder)?; + // tell the builder the row is finished + builder.append_value(""); + } + } + } + Ok(Arc::new(builder.finish())) +} + +/// Parse UTF-8 +pub(crate) fn parse_string( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let string_array = array.as_string::(); + let array = if cast_options.safe { + let iter = string_array.iter().map(|x| x.and_then(P::parse)); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { PrimitiveArray::

::from_trusted_len_iter(iter) } + } else { + let v = string_array + .iter() + .map(|x| match x { + Some(v) => P::parse(v).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + P::DATA_TYPE + )) + }), + None => Ok(P::Native::default()), + }) + .collect::, ArrowError>>()?; + PrimitiveArray::new(v.into(), string_array.nulls().cloned()) + }; + + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) +pub(crate) fn cast_string_to_timestamp( + array: &dyn Array, + to_tz: &Option>, + cast_options: &CastOptions, +) -> Result { + let array = array.as_string::(); + let out: PrimitiveArray = match to_tz { + Some(tz) => { + let tz: Tz = tz.as_ref().parse()?; + cast_string_to_timestamp_impl(array, &tz, cast_options)? + } + None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, + }; + Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) +} + +pub(crate) fn cast_string_to_timestamp_impl< + O: OffsetSizeTrait, + T: ArrowTimestampType, + Tz: TimeZone, +>( + array: &GenericStringArray, + tz: &Tz, + cast_options: &CastOptions, +) -> Result, ArrowError> { + if cast_options.safe { + let iter = array.iter().map(|v| { + v.and_then(|v| { + let naive = string_to_datetime(tz, v).ok()?.naive_utc(); + T::make_value(naive) + }) + }); + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) + } else { + let vec = array + .iter() + .map(|v| { + v.map(|v| { + let naive = string_to_datetime(tz, v)?.naive_utc(); + T::make_value(naive).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow converting {naive} to {:?}", + T::UNIT + )) + }) + }) + .transpose() + }) + .collect::>, _>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) + } +} + +pub(crate) fn cast_string_to_interval( + array: &dyn Array, + cast_options: &CastOptions, + parse_function: F, +) -> Result +where + Offset: OffsetSizeTrait, + ArrowType: ArrowPrimitiveType, + F: Fn(&str) -> Result + Copy, +{ + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let interval_array = if cast_options.safe { + let iter = string_array + .iter() + .map(|v| v.and_then(|v| parse_function(v).ok())); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| v.map(parse_function).transpose()) + .collect::, ArrowError>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } + }; + Ok(Arc::new(interval_array) as ArrayRef) +} + +pub(crate) fn cast_string_to_year_month_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_string_to_interval::( + array, + cast_options, + parse_interval_year_month, + ) +} + +pub(crate) fn cast_string_to_day_time_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_string_to_interval::( + array, + cast_options, + parse_interval_day_time, + ) +} + +pub(crate) fn cast_string_to_month_day_nano_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_string_to_interval::( + array, + cast_options, + parse_interval_month_day_nano, + ) +} + +/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same +/// offset size so re-encoding offset is unnecessary. +pub(crate) fn cast_binary_to_string( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let array = array + .as_any() + .downcast_ref::>>() + .unwrap(); + + match GenericStringArray::::try_from_binary(array.clone()) { + Ok(a) => Ok(Arc::new(a)), + Err(e) => match cast_options.safe { + true => { + // Fallback to slow method to convert invalid sequences to nulls + let mut builder = + GenericStringBuilder::::with_capacity(array.len(), array.value_data().len()); + + let iter = array + .iter() + .map(|v| v.and_then(|v| std::str::from_utf8(v).ok())); + + builder.extend(iter); + Ok(Arc::new(builder.finish())) + } + false => Err(e), + }, + } +} + +/// Casts Utf8 to Boolean +pub(crate) fn cast_utf8_to_boolean( + from: &dyn Array, + cast_options: &CastOptions, +) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let array = from + .as_any() + .downcast_ref::>() + .unwrap(); + + let output_array = array + .iter() + .map(|value| match value { + Some(value) => match value.to_ascii_lowercase().trim() { + "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)), + "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => { + Ok(Some(false)) + } + invalid_value => match cast_options.safe { + true => Ok(None), + false => Err(ArrowError::CastError(format!( + "Cannot cast value '{invalid_value}' to value of Boolean type", + ))), + }, + }, + None => Ok(None), + }) + .collect::>()?; + + Ok(Arc::new(output_array)) +} From 592129a3d26220b9898480d0f9e7a1204a1c987e Mon Sep 17 00:00:00 2001 From: Clide Stefani <109172241+Monkwire3@users.noreply.github.com> Date: Mon, 1 Apr 2024 15:46:54 -0400 Subject: [PATCH 2/4] Remove duplicate function --- arrow-cast/src/cast/mod.rs | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 446a370e61f5..3e2bf4392ff0 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -2038,41 +2038,6 @@ fn adjust_timestamp_to_timezone( Ok(adjusted) } -/// Casts Utf8 to Boolean -fn cast_utf8_to_boolean( - from: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - OffsetSize: OffsetSizeTrait, -{ - let array = from - .as_any() - .downcast_ref::>() - .unwrap(); - - let output_array = array - .iter() - .map(|value| match value { - Some(value) => match value.to_ascii_lowercase().trim() { - "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)), - "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => { - Ok(Some(false)) - } - invalid_value => match cast_options.safe { - true => Ok(None), - false => Err(ArrowError::CastError(format!( - "Cannot cast value '{invalid_value}' to value of Boolean type", - ))), - }, - }, - None => Ok(None), - }) - .collect::>()?; - - Ok(Arc::new(output_array)) -} - /// Cast numeric types to Boolean /// /// Any zero value returns `false` while non-zero returns `true` From 9eaeb6bf7a9a5203d02779ae95fcd935e65dbc85 Mon Sep 17 00:00:00 2001 From: Clide Stefani <109172241+Monkwire3@users.noreply.github.com> Date: Wed, 3 Apr 2024 08:57:51 -0400 Subject: [PATCH 3/4] Apply changes --- arrow-cast/src/cast/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index e7d7bfe1b3d5..088018672341 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -88,7 +88,7 @@ pub(crate) fn cast_string_to_timestamp Date: Wed, 3 Apr 2024 09:00:58 -0400 Subject: [PATCH 4/4] Format change --- arrow-cast/src/cast/string.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 088018672341..e9c1ff58d62f 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -88,11 +88,7 @@ pub(crate) fn cast_string_to_timestamp( +fn cast_string_to_timestamp_impl( array: &GenericStringArray, tz: &Tz, cast_options: &CastOptions,