Skip to content

Commit

Permalink
Split arrow_cast::cast::string into its own submodule (#5563)
Browse files Browse the repository at this point in the history
* Split cast::string into a submodule of cast

* Remove duplicate function

* Apply changes

* Format change

---------

Co-authored-by: Clide Stefani <[email protected]>
  • Loading branch information
monkwire and monkwire authored Apr 3, 2024
1 parent 8884083 commit 40409e4
Show file tree
Hide file tree
Showing 2 changed files with 272 additions and 252 deletions.
254 changes: 2 additions & 252 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,11 @@
mod decimal;
mod dictionary;
mod list;
mod string;
use crate::cast::decimal::*;
use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;

use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
Expand Down Expand Up @@ -2001,26 +2003,6 @@ where
from.unary_opt::<_, R>(num::cast::cast::<T::Native, R::Native>)
}

/// Renders every element of `array` as text and collects the results into a
/// `GenericStringArray<O>`.
///
/// Formatting is delegated to an [`ArrayFormatter`] configured from
/// `options.format_options`. Slots that are null in the input stay null in
/// the output.
///
/// # Errors
///
/// Returns an error if the formatter cannot be created for `array`, or if
/// writing a formatted value fails.
fn value_to_string<O: OffsetSizeTrait>(
    array: &dyn Array,
    options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
    let mut out = GenericStringBuilder::<O>::new();
    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
    let validity = array.nulls();

    for row in 0..array.len() {
        // An array without a null buffer has no null rows.
        if matches!(validity, Some(mask) if mask.is_null(row)) {
            out.append_null();
            continue;
        }

        // The formatter writes straight into the builder; appending an empty
        // string afterwards tells the builder the row is finished.
        formatter.value(row).write(&mut out)?;
        out.append_value("");
    }

    Ok(Arc::new(out.finish()))
}

fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError> {
Expand All @@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
)))
}

/// Parses every string of a UTF-8 array into the primitive type `P`.
///
/// With `cast_options.safe` set, values rejected by `P::parse` become null.
/// Otherwise the first rejected value aborts the cast with an
/// `ArrowError::CastError`. Null inputs stay null in both modes.
fn parse_string<P: Parser, O: OffsetSizeTrait>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
    let strings = array.as_string::<O>();

    if cast_options.safe {
        let parsed = strings.iter().map(|item| item.and_then(P::parse));

        // Benefit:
        //     20% performance improvement
        // SAFETY:
        //     The iterator is trustedLen because it comes from an `StringArray`.
        let lenient = unsafe { PrimitiveArray::<P>::from_trusted_len_iter(parsed) };
        return Ok(Arc::new(lenient) as ArrayRef);
    }

    let mut values: Vec<P::Native> = Vec::with_capacity(strings.len());
    for item in strings.iter() {
        let native = match item {
            Some(text) => P::parse(text).ok_or_else(|| {
                ArrowError::CastError(format!(
                    "Cannot cast string '{}' to value of {:?} type",
                    text,
                    P::DATA_TYPE
                ))
            })?,
            // Placeholder only: the slot is masked by the null buffer cloned
            // from the input below.
            None => P::Native::default(),
        };
        values.push(native);
    }

    let strict = PrimitiveArray::<P>::new(values.into(), strings.nulls().cloned());
    Ok(Arc::new(strict) as ArrayRef)
}

/// Casts a generic string array to a timestamp array of type `T`
/// (`TimestampNanosecondArray`, etc.).
///
/// When `to_tz` names a timezone it is parsed and handed to the conversion
/// routine; without one, `Utc` is used. The resulting array is tagged with
/// `to_tz` via `with_timezone_opt`.
///
/// # Errors
///
/// Fails if the timezone string cannot be parsed, or if the underlying
/// conversion reports an error.
fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
    array: &dyn Array,
    to_tz: &Option<Arc<str>>,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
    let strings = array.as_string::<O>();

    let converted: PrimitiveArray<T> = if let Some(zone_name) = to_tz {
        let zone: Tz = zone_name.as_ref().parse()?;
        cast_string_to_timestamp_impl(strings, &zone, cast_options)?
    } else {
        cast_string_to_timestamp_impl(strings, &Utc, cast_options)?
    };

    let tagged = converted.with_timezone_opt(to_tz.clone());
    Ok(Arc::new(tagged))
}

/// Converts each string of `array` into a timestamp value of type `T`,
/// interpreting the text relative to the timezone `tz`.
///
/// In safe mode (`cast_options.safe`), any string that fails to parse or
/// whose value cannot be represented by `T` becomes null. In strict mode the
/// first such string aborts the conversion with an error. Null inputs stay
/// null in both modes.
fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz: TimeZone>(
    array: &GenericStringArray<O>,
    tz: &Tz,
    cast_options: &CastOptions,
) -> Result<PrimitiveArray<T>, ArrowError> {
    if !cast_options.safe {
        let mut converted: Vec<Option<i64>> = Vec::with_capacity(array.len());
        for item in array.iter() {
            let slot = match item {
                Some(text) => {
                    let naive = string_to_datetime(tz, text)?.naive_utc();
                    let value = T::make_value(naive).ok_or_else(|| {
                        ArrowError::CastError(format!(
                            "Overflow converting {naive} to {:?}",
                            T::UNIT
                        ))
                    })?;
                    Some(value)
                }
                None => None,
            };
            converted.push(slot);
        }

        // Benefit:
        //     20% performance improvement
        // SAFETY:
        //     A slice iterator over the collected `Vec` reports its exact length.
        return Ok(unsafe { PrimitiveArray::from_trusted_len_iter(converted.iter()) });
    }

    let lenient = array.iter().map(|item| {
        let text = item?;
        let parsed = string_to_datetime(tz, text).ok()?;
        T::make_value(parsed.naive_utc())
    });

    // Benefit:
    //     20% performance improvement
    // SAFETY:
    //     The iterator is trustedLen because it comes from an `StringArray`.
    Ok(unsafe { PrimitiveArray::from_trusted_len_iter(lenient) })
}

/// Casts a generic string array to an interval array of type `ArrowType`,
/// using `parse_function` to turn each string into the native interval value.
///
/// In safe mode (`cast_options.safe`), strings that `parse_function` rejects
/// become null. In strict mode the first parse error is returned. Null inputs
/// stay null in both modes.
///
/// # Panics
///
/// Panics if `array` is not a `GenericStringArray<Offset>`.
fn cast_string_to_interval<Offset, F, ArrowType>(
    array: &dyn Array,
    cast_options: &CastOptions,
    parse_function: F,
) -> Result<ArrayRef, ArrowError>
where
    Offset: OffsetSizeTrait,
    ArrowType: ArrowPrimitiveType,
    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
{
    let strings = array
        .as_any()
        .downcast_ref::<GenericStringArray<Offset>>()
        .unwrap();

    let intervals = if cast_options.safe {
        let lenient = strings
            .iter()
            .map(|item| item.and_then(|text| parse_function(text).ok()));

        // Benefit:
        //     20% performance improvement
        // SAFETY:
        //     The iterator is trustedLen because it comes from an `StringArray`.
        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(lenient) }
    } else {
        let mut parsed: Vec<Option<ArrowType::Native>> = Vec::with_capacity(strings.len());
        for item in strings.iter() {
            let slot = match item {
                Some(text) => Some(parse_function(text)?),
                None => None,
            };
            parsed.push(slot);
        }

        // Benefit:
        //     20% performance improvement
        // SAFETY:
        //     Consuming a `Vec` yields an iterator that reports its exact length.
        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(parsed) }
    };

    Ok(Arc::new(intervals) as ArrayRef)
}

/// Casts a generic string array to an `IntervalYearMonthType` array by
/// delegating to `cast_string_to_interval` with `parse_interval_year_month`
/// as the per-value parser.
fn cast_string_to_year_month_interval<Offset>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
    Offset: OffsetSizeTrait,
{
    let parser = parse_interval_year_month;
    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(array, cast_options, parser)
}

/// Casts a generic string array to an `IntervalDayTimeType` array by
/// delegating to `cast_string_to_interval` with `parse_interval_day_time`
/// as the per-value parser.
fn cast_string_to_day_time_interval<Offset>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
    Offset: OffsetSizeTrait,
{
    let parser = parse_interval_day_time;
    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(array, cast_options, parser)
}

/// Casts a generic string array to an `IntervalMonthDayNanoType` array by
/// delegating to `cast_string_to_interval` with
/// `parse_interval_month_day_nano` as the per-value parser.
fn cast_string_to_month_day_nano_interval<Offset>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
    Offset: OffsetSizeTrait,
{
    let parser = parse_interval_month_day_nano;
    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(array, cast_options, parser)
}

fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
array: PrimitiveArray<Int64Type>,
to_tz: &Tz,
Expand All @@ -2222,41 +2038,6 @@ fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
Ok(adjusted)
}

/// Casts Utf8 to Boolean
///
/// Each value is trimmed and compared ASCII-case-insensitively against the
/// accepted literals:
///
/// * true: `t`, `tr`, `tru`, `true`, `y`, `ye`, `yes`, `on`, `1`
/// * false: `f`, `fa`, `fal`, `fals`, `false`, `n`, `no`, `of`, `off`, `0`
///
/// Null inputs stay null. Any other value becomes null when
/// `cast_options.safe` is set, and otherwise aborts the cast.
///
/// # Errors
///
/// Returns `ArrowError::CastError` for the first unrecognised value when
/// `cast_options.safe` is `false`. The message reports the trimmed,
/// ASCII-lowercased form of the value.
///
/// # Panics
///
/// Panics if `from` is not a `GenericStringArray<OffsetSize>`.
fn cast_utf8_to_boolean<OffsetSize>(
    from: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
    OffsetSize: OffsetSizeTrait,
{
    const TRUE_LITERALS: [&str; 9] = ["t", "tr", "tru", "true", "y", "ye", "yes", "on", "1"];
    const FALSE_LITERALS: [&str; 10] = [
        "f", "fa", "fal", "fals", "false", "n", "no", "of", "off", "0",
    ];

    /// Returns whether `text` equals one of `literals`, ignoring ASCII case.
    fn is_one_of(text: &str, literals: &[&str]) -> bool {
        literals
            .iter()
            .any(|literal| text.eq_ignore_ascii_case(literal))
    }

    let array = from
        .as_any()
        .downcast_ref::<GenericStringArray<OffsetSize>>()
        .unwrap();

    let output_array = array
        .iter()
        // Trim the borrowed slice and compare case-insensitively so that no
        // `String` is allocated per row; lowercasing only happens when an
        // error message has to be built.
        .map(|value| match value.map(str::trim) {
            None => Ok(None),
            Some(text) if is_one_of(text, &TRUE_LITERALS) => Ok(Some(true)),
            Some(text) if is_one_of(text, &FALSE_LITERALS) => Ok(Some(false)),
            Some(_) if cast_options.safe => Ok(None),
            Some(text) => {
                // ASCII lowercasing never touches whitespace, so this equals
                // lowercasing the raw value first and trimming afterwards.
                let invalid_value = text.to_ascii_lowercase();
                Err(ArrowError::CastError(format!(
                    "Cannot cast value '{invalid_value}' to value of Boolean type",
                )))
            }
        })
        .collect::<Result<BooleanArray, _>>()?;

    Ok(Arc::new(output_array))
}

/// Cast numeric types to Boolean
///
/// Any zero value returns `false` while non-zero returns `true`
Expand Down Expand Up @@ -2325,37 +2106,6 @@ where
unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
}

/// Casts a `GenericBinaryArray` to a `GenericStringArray` with the same
/// offset size, so offsets never need re-encoding.
///
/// The whole-array conversion `try_from_binary` is attempted first. If it
/// fails and `cast_options.safe` is set, the array is rebuilt value by value
/// and every invalid UTF-8 sequence becomes null. If it fails in strict mode,
/// the conversion error is returned unchanged.
///
/// # Panics
///
/// Panics if `array` is not a `GenericByteArray<GenericBinaryType<O>>`.
fn cast_binary_to_string<O: OffsetSizeTrait>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
    let binary = array
        .as_any()
        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
        .unwrap();

    let failure = match GenericStringArray::<O>::try_from_binary(binary.clone()) {
        Ok(strings) => return Ok(Arc::new(strings)),
        Err(error) => error,
    };

    if !cast_options.safe {
        return Err(failure);
    }

    // Fallback to slow method to convert invalid sequences to nulls
    let mut builder =
        GenericStringBuilder::<O>::with_capacity(binary.len(), binary.value_data().len());
    let valid_utf8_only = binary
        .iter()
        .map(|bytes| bytes.and_then(|raw| std::str::from_utf8(raw).ok()));
    builder.extend(valid_utf8_only);

    Ok(Arc::new(builder.finish()))
}

/// Helper function to cast from one `BinaryArray` or `LargeBinaryArray` to `FixedSizeBinaryArray`.
fn cast_binary_to_fixed_size_binary<O: OffsetSizeTrait>(
array: &dyn Array,
Expand Down
Loading

0 comments on commit 40409e4

Please sign in to comment.