Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split arrow_cast::cast::string into it's own submodule #5563

Merged
merged 5 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 2 additions & 252 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,11 @@
mod decimal;
mod dictionary;
mod list;
mod string;
use crate::cast::decimal::*;
use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;

use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
Expand Down Expand Up @@ -2001,26 +2003,6 @@ where
from.unary_opt::<_, R>(num::cast::cast::<T::Native, R::Native>)
}

fn value_to_string<O: OffsetSizeTrait>(
array: &dyn Array,
options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let mut builder = GenericStringBuilder::<O>::new();
let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
let nulls = array.nulls();
for i in 0..array.len() {
match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
true => builder.append_null(),
false => {
formatter.value(i).write(&mut builder)?;
// tell the builder the row is finished
builder.append_value("");
}
}
}
Ok(Arc::new(builder.finish()))
}

fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError> {
Expand All @@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
)))
}

/// Parse UTF-8
fn parse_string<P: Parser, O: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let string_array = array.as_string::<O>();
let array = if cast_options.safe {
let iter = string_array.iter().map(|x| x.and_then(P::parse));

// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
} else {
let v = string_array
.iter()
.map(|x| match x {
Some(v) => P::parse(v).ok_or_else(|| {
ArrowError::CastError(format!(
"Cannot cast string '{}' to value of {:?} type",
v,
P::DATA_TYPE
))
}),
None => Ok(P::Native::default()),
})
.collect::<Result<Vec<_>, ArrowError>>()?;
PrimitiveArray::new(v.into(), string_array.nulls().cloned())
};

Ok(Arc::new(array) as ArrayRef)
}

/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
array: &dyn Array,
to_tz: &Option<Arc<str>>,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let array = array.as_string::<O>();
let out: PrimitiveArray<T> = match to_tz {
Some(tz) => {
let tz: Tz = tz.as_ref().parse()?;
cast_string_to_timestamp_impl(array, &tz, cast_options)?
}
None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
};
Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
}

fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz: TimeZone>(
array: &GenericStringArray<O>,
tz: &Tz,
cast_options: &CastOptions,
) -> Result<PrimitiveArray<T>, ArrowError> {
if cast_options.safe {
let iter = array.iter().map(|v| {
v.and_then(|v| {
let naive = string_to_datetime(tz, v).ok()?.naive_utc();
T::make_value(naive)
})
});
// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.

Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
} else {
let vec = array
.iter()
.map(|v| {
v.map(|v| {
let naive = string_to_datetime(tz, v)?.naive_utc();
T::make_value(naive).ok_or_else(|| {
ArrowError::CastError(format!(
"Overflow converting {naive} to {:?}",
T::UNIT
))
})
})
.transpose()
})
.collect::<Result<Vec<Option<i64>>, _>>()?;

// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
}
}

fn cast_string_to_interval<Offset, F, ArrowType>(
array: &dyn Array,
cast_options: &CastOptions,
parse_function: F,
) -> Result<ArrayRef, ArrowError>
where
Offset: OffsetSizeTrait,
ArrowType: ArrowPrimitiveType,
F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
{
let string_array = array
.as_any()
.downcast_ref::<GenericStringArray<Offset>>()
.unwrap();
let interval_array = if cast_options.safe {
let iter = string_array
.iter()
.map(|v| v.and_then(|v| parse_function(v).ok()));

// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
} else {
let vec = string_array
.iter()
.map(|v| v.map(parse_function).transpose())
.collect::<Result<Vec<_>, ArrowError>>()?;

// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
};
Ok(Arc::new(interval_array) as ArrayRef)
}

fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
array,
cast_options,
parse_interval_year_month,
)
}

fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
array,
cast_options,
parse_interval_day_time,
)
}

fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
array,
cast_options,
parse_interval_month_day_nano,
)
}

fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
array: PrimitiveArray<Int64Type>,
to_tz: &Tz,
Expand All @@ -2222,41 +2038,6 @@ fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
Ok(adjusted)
}

/// Casts Utf8 to Boolean
fn cast_utf8_to_boolean<OffsetSize>(
from: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
OffsetSize: OffsetSizeTrait,
{
let array = from
.as_any()
.downcast_ref::<GenericStringArray<OffsetSize>>()
.unwrap();

let output_array = array
.iter()
.map(|value| match value {
Some(value) => match value.to_ascii_lowercase().trim() {
"t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
"f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
Ok(Some(false))
}
invalid_value => match cast_options.safe {
true => Ok(None),
false => Err(ArrowError::CastError(format!(
"Cannot cast value '{invalid_value}' to value of Boolean type",
))),
},
},
None => Ok(None),
})
.collect::<Result<BooleanArray, _>>()?;

Ok(Arc::new(output_array))
}

/// Cast numeric types to Boolean
///
/// Any zero value returns `false` while non-zero returns `true`
Expand Down Expand Up @@ -2325,37 +2106,6 @@ where
unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
}

/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
/// offset size so re-encoding offset is unnecessary.
fn cast_binary_to_string<O: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let array = array
.as_any()
.downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
.unwrap();

match GenericStringArray::<O>::try_from_binary(array.clone()) {
Ok(a) => Ok(Arc::new(a)),
Err(e) => match cast_options.safe {
true => {
// Fallback to slow method to convert invalid sequences to nulls
let mut builder =
GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());

let iter = array
.iter()
.map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));

builder.extend(iter);
Ok(Arc::new(builder.finish()))
}
false => Err(e),
},
}
}

/// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to 'FixedSizeBinaryArray'.
fn cast_binary_to_fixed_size_binary<O: OffsetSizeTrait>(
array: &dyn Array,
Expand Down
Loading
Loading