diff --git a/Cargo.lock b/Cargo.lock index 213ccbc3..7135d487 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1688,6 +1688,32 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jiff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a45489186a6123c128fdf6016183fcfab7113e1820eb813127e036e287233fb" +dependencies = [ + "jiff-tzdb-platform", + "serde", + "windows-sys 0.59.0", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "js-sys" version = "0.3.70" @@ -2359,6 +2385,7 @@ dependencies = [ "chrono", "criterion", "half", + "jiff", "rand", "rust_decimal", "serde", diff --git a/Changes.md b/Changes.md index dab8148c..24659aaf 100644 --- a/Changes.md +++ b/Changes.md @@ -8,6 +8,8 @@ New features to `Time64(Nanosecond))`) in `from_samples` - Improved error messages for non self describing types (`chrono::*`, `uuid::Uuid`, `std::net::IpAddr`) +- Add support for various `jiff` types (`jiff::Date`, `jiff::Time`, `jiff::DateTime`, + `jiff::Timestamp`, `jiff::Span`, `jiff::SignedDuration`) ## 0.12.0 diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index e546c5e0..fa1c2157 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -138,6 +138,7 @@ serde_bytes = "0.11" rand = "0.8" bigdecimal = {version = "0.4", features = ["serde"] } uuid = { version = "1.10.0", features = ["serde", "v4"] } +jiff = { version = "0.1", features = ["serde"] } # for benchmarks # 
arrow-version:replace: arrow-json-{version} = {{ package = "arrow-json", version = "{version}" }} diff --git a/serde_arrow/Status.md b/serde_arrow/Status.md index 86b86464..9a68cc11 100644 --- a/serde_arrow/Status.md +++ b/serde_arrow/Status.md @@ -1,6 +1,15 @@ # Status -Supported arrow data types: +The page documents the supported types both from an Arrow and a Rust perspective. + +- [Arrow data types](#arrow-data-types) +- [Rust types](#rust-types) + - [Native / standard types](#native--standard-types) + - [`chrono` types](#chrono-types) + - [`jiff` types](#jiff-types) + - [`rust_decimal` and `bigdecimal` types](#rust_decimal-and-bigdecimal-types) + +## Arrow data types - [x] [`Null`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Null) - [x] [`Boolean`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Boolean) @@ -49,7 +58,9 @@ Supported arrow data types: serialization error. - [ ] [`Decimal256(precision, scale)`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Decimal256) -Native / standard Rust types: +## Rust types + +### Native / standard types - [x] `bool` - [x] `i8`, `i16`, `i32`, `i64` @@ -72,54 +83,118 @@ Native / standard Rust types: supported - [x] `struct S(T)`: newtype structs are supported, if `T` is supported -Non-standard Rust types - -- [x] `chrono::DateTime`: - - is serialized / deserialized as strings - - can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` with strategy `UtcStrAsDate64` - - `from_samples` detects the type `LargeUtf8` without configuration, the type `Date64` with - strategy `UtcStrAsDate64` when setting `guess_dates = true` - - `from_type` is not supported, as the type is not self-describing -- [x] `chrono::DateTime` using [`chrono::serde::ts_microseconds`][chrono-ts-microseconds]: - - is serialized / deserialized as `i64` - - can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` without Strategy, - `Date64` 
with strategy `UtcStrAsDate64` - - `from_samples` and `from_type` detect the type `Int64` -- [x] `chrono::NaiveDateTime`: - - is serialized / deserialized as strings - - can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`, `Date64` with strategy `NaiveStrAsDate64` - - `from_samples` detects the type `LargeUtf8` without configuration, the type `Date64` with - strategy `NaiveStrAsDate64` when setting `guess_dates = true` - - `from_type` is not supported, as the type is not self-describing -- [x] `chrono::NaiveTime`: - - serialized / deserialized as strings - - can be mapped to `Utf8`, `LargeUtf8`, `Time32(..)` and `Time64` arrays - - `from_samples` detects the type `LargeUtf8` without configuration, the type `Time64(Nanosecond)` - when setting `guess_dates = true` - - `from_type` is not supported, as the type is not self-describing -- [x] `chrono::NaiveDate`: - - is serialized as Serde strings - - can be mapped to `Utf8`, `LargeUtf8`, `Date32` arrays - - `from_samples` detects the type `LargeUtf8` without configuration, to `Date32` when setting - `guess_dates = true` - - `from_type` is not supported, as the type is not self-describing -- [ ] `chrono::Duration`: does not support Serde and is therefore not supported -- [x] [`rust_decimal::Decimal`][rust_decimal::Decimal] for the `float` and `str` - (de)serialization options when using the `Decimal128(..)` data type -- [x] [`bigdecimal::BigDecimal`][bigdecimal::BigDecimal] when using the - `Decimal128(..)` data type - - -[crate::base::Event]: https://docs.rs/serde_arrow/latest/serde_arrow/event/enum.Event.html -[crate::to_record_batch]: https://docs.rs/serde_arrow/latest/serde_arrow/fn.to_record_batch.html -[crate::trace_schema]: https://docs.rs/serde_arrow/latest/serde_arrow/fn.trace_schema.html -[serde::Serialize]: https://docs.serde.rs/serde/trait.Serialize.html -[serde::Deserialize]: https://docs.serde.rs/serde/trait.Deserialize.html -[crate::Schema::from_records]: 
https://docs.rs/serde_arrow/latest/serde_arrow/struct.Schema.html#method.from_records -[chrono]: https://docs.rs/chrono/latest/chrono/ - -[crate::base::EventSource]: https://docs.rs/serde_arrow -[crate::base::EventSink]: https://docs.rs/serde_arrow +### `chrono` types + +#### `chrono::DateTime` + +- is serialized / deserialized as strings +- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` with strategy `UtcStrAsDate64` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date64` with strategy `UtcStrAsDate64` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +With [`chrono::serde::ts_microseconds`][chrono-ts-microseconds]: + +- is serialized / deserialized as `i64` +- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` without Strategy, + `Date64` with strategy `UtcStrAsDate64` +- `from_samples` and `from_type` detect `Int64` + +#### `chrono::NaiveDateTime` + +- is serialized / deserialized as strings +- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`, `Date64` with strategy `NaiveStrAsDate64` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date64` with strategy `NaiveStrAsDate64` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `chrono::NaiveTime` + +- serialized / deserialized as strings +- can be mapped to `Utf8`, `LargeUtf8`, `Time32(..)` and `Time64` arrays +- `from_samples` detects + - `LargeUtf8` without configuration + - `Time64(Nanosecond)` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `chrono::NaiveDate` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Date32` arrays +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date32` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + 
+`chrono::Duration` does not support Serde and is therefore not supported + +### `jiff` types + +#### `jiff::Date` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Date32` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date32` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `jiff::Time` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Time32(..)`, `Time64(..)` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Time64(Nanosecond)` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `jiff::DateTime` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`, `Date64` with strategy + `NaiveStrAsDate64` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date64` with strategy `NaiveStrAsDate64` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `jiff::Timestamp` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("UTC"))`, `Date64` with strategy + `UtcStrAsDate64` +- `from_samples` detects + - `LargeUtf8` without configuration + - `Date64` with strategy `UtcStrAsDate64` when setting `guess_dates = true` +- `from_type` is not supported, as the type is not self-describing + +#### `jiff::Span` + +- is serialized as Serde strings +- can be mapped to `Utf8`, `LargeUtf8`, `Duration(..)` +- `from_samples` detects `LargeUtf8` +- `from_type` is not supported, as the type is not self-describing + +#### `jiff::SignedDuration` + +Same as `jiff::Span` + +#### `jiff::Zoned` + +is not supported as there is no clear way of implementation + +### `rust_decimal` and `bigdecimal` types + +### [`rust_decimal::Decimal`][rust_decimal::Decimal] + +- for the `float` and `str` (de)serialization options when using the 
`Decimal128(..)` data type + +### [`bigdecimal::BigDecimal`][bigdecimal::BigDecimal] + +- when using the `Decimal128(..)` data type + [chrono-ts-microseconds]: https://docs.rs/chrono/latest/chrono/serde/ts_microseconds/ [rust_decimal::Decimal]: https://docs.rs/rust_decimal/latest/rust_decimal/struct.Decimal.html [bigdecimal::BigDecimal]: https://docs.rs/bigdecimal/0.4.2/bigdecimal/struct.BigDecimal.html diff --git a/serde_arrow/src/_impl/docs/defs.rs b/serde_arrow/src/_impl/docs/defs.rs index 3320c2fa..e30c8df8 100644 --- a/serde_arrow/src/_impl/docs/defs.rs +++ b/serde_arrow/src/_impl/docs/defs.rs @@ -28,7 +28,7 @@ pub fn example_arrow_arrays() -> (Vec, let items = example_records(); let fields = Vec::::from_type::(TracingOptions::default()).unwrap(); - let arrays = crate::to_arrow(&fields, &items).unwrap(); + let arrays = crate::to_arrow(&fields, items).unwrap(); (fields, arrays) } @@ -40,7 +40,7 @@ pub fn example_arrow2_arrays() -> (Vec, let items = example_records(); let fields = Vec::::from_type::(TracingOptions::default()).unwrap(); - let arrays = crate::to_arrow2(&fields, &items).unwrap(); + let arrays = crate::to_arrow2(&fields, items).unwrap(); (fields, arrays) } diff --git a/serde_arrow/src/arrow2_impl/api.rs b/serde_arrow/src/arrow2_impl/api.rs index 41cad44b..602d4e90 100644 --- a/serde_arrow/src/arrow2_impl/api.rs +++ b/serde_arrow/src/arrow2_impl/api.rs @@ -107,7 +107,7 @@ impl crate::internal::array_builder::ArrayBuilder { /// Construct `arrow2` arrays and reset the builder (*requires one of the /// `arrow2-*` features*) pub fn to_arrow2(&mut self) -> Result>> { - self.to_arrays()? + self.build_arrays()? 
.into_iter() .map(Box::::try_from) .collect() diff --git a/serde_arrow/src/arrow_impl/api.rs b/serde_arrow/src/arrow_impl/api.rs index cf298f27..126c4d3b 100644 --- a/serde_arrow/src/arrow_impl/api.rs +++ b/serde_arrow/src/arrow_impl/api.rs @@ -186,7 +186,7 @@ impl crate::internal::array_builder::ArrayBuilder { /// Construct `arrow` arrays and reset the builder (*requires one of the /// `arrow-*` features*) pub fn to_arrow(&mut self) -> Result> { - self.to_arrays()? + self.build_arrays()? .into_iter() .map(ArrayRef::try_from) .collect() diff --git a/serde_arrow/src/internal/array_builder.rs b/serde_arrow/src/internal/array_builder.rs index 869c9694..1be34f3e 100644 --- a/serde_arrow/src/internal/array_builder.rs +++ b/serde_arrow/src/internal/array_builder.rs @@ -83,7 +83,7 @@ impl ArrayBuilder { self.builder.extend(items) } - pub(crate) fn to_arrays(&mut self) -> Result> { + pub(crate) fn build_arrays(&mut self) -> Result> { let mut arrays = Vec::new(); for field in self.builder.take_records()? { arrays.push(field.into_array()?); diff --git a/serde_arrow/src/internal/chrono.rs b/serde_arrow/src/internal/chrono.rs new file mode 100644 index 00000000..d7a0fbe7 --- /dev/null +++ b/serde_arrow/src/internal/chrono.rs @@ -0,0 +1,1003 @@ +//! Support for Parsing datetime related quantities +//! 
+use crate::internal::{arrow::TimeUnit, error::Result}; + +use parsing::ParseResult; + +pub use parsing::Span; + +use super::error::fail; + +/// Check whether `s` can be parsed as a naive datetime +pub fn matches_naive_datetime(s: &str) -> bool { + parsing::match_naive_datetime(s).matches() +} + +/// Check whether `s` can be parsed as a UTC datetime +pub fn matches_utc_datetime(s: &str) -> bool { + parsing::match_utc_datetime(s).matches() +} + +/// Check whether `s` can be parsed as a naive date +pub fn matches_naive_date(s: &str) -> bool { + parsing::match_naive_date(s).matches() +} + +/// Check whether `s` can be parsed as a naive time +pub fn matches_naive_time(s: &str) -> bool { + parsing::match_naive_time(s).matches() +} + +/// Parse `s` as a span +pub fn parse_span(s: &str) -> Result> { + parsing::match_span(s).into_result("Span") +} + +impl<'a> parsing::Span<'a> { + /// Convert the `Span` into an `i64` with the given `unit` + pub fn to_arrow_duration(&self, unit: TimeUnit) -> Result { + if get_optional_digit_value(self.year)? != 0 || get_optional_digit_value(self.month)? != 0 { + fail!("Cannot convert interval style spans to a duration"); + } + + let second_value = self.get_second_value()?; + let nanosecond_value = self.get_nanosecond_value()?; + Self::build_duration(self.sign, second_value, nanosecond_value, unit) + } + + fn get_second_value(&self) -> Result { + Ok(get_optional_digit_value(self.week)? * 7 * 24 * 60 * 60 + + get_optional_digit_value(self.day)? * 24 * 60 * 60 + + get_optional_digit_value(self.hour)? * 60 * 60 + + get_optional_digit_value(self.minute)? * 60 + + get_optional_digit_value(self.second)?) 
+ } + + fn get_nanosecond_value(&self) -> Result { + let Some(subsecond) = self.subsecond else { + return Ok(0); + }; + let subsecond_val: i64 = subsecond.parse()?; + let subsecond_len = u32::try_from(subsecond.len())?; + + if subsecond_len <= 9 { + Ok(subsecond_val * 10_i64.pow(9 - subsecond_len)) + } else { + Ok(subsecond_val / 10_i64.pow(subsecond_len - 9)) + } + } + + fn build_duration( + sign: Option, + second_value: i64, + nanosecond_value: i64, + unit: TimeUnit, + ) -> Result { + let unsigned_duration = match unit { + TimeUnit::Second => second_value, + TimeUnit::Millisecond => match second_value.checked_mul(1_000_i64) { + Some(res) => res + nanosecond_value / 1_000_000, + None => fail!("Cannot represent {second_value} with Millisecond resolution"), + }, + TimeUnit::Microsecond => match second_value.checked_mul(1_000_000_i64) { + Some(res) => res + nanosecond_value / 1_000, + None => fail!("Cannot represent {second_value} with Microsecond resolution"), + }, + TimeUnit::Nanosecond => match second_value.checked_mul(1_000_000_000_i64) { + Some(res) => res + nanosecond_value, + None => fail!("Cannot represent {second_value} with Nanosecond resolution"), + }, + }; + + if sign == Some('-') { + Ok(-unsigned_duration) + } else { + Ok(unsigned_duration) + } + } +} + +/// Format a duration in the given unit as a Span string +pub fn format_arrow_duration_as_span(value: i64, unit: TimeUnit) -> String { + let (value, sign) = if value < 0 { + (value.unsigned_abs(), "-") + } else { + (value.unsigned_abs(), "") + }; + + match unit { + TimeUnit::Second => format!("{sign}PT{value}s"), + TimeUnit::Millisecond => format!( + "{sign}PT{second}.{subsecond:03}s", + second = value / 1_000, + subsecond = value % 1_000 + ), + TimeUnit::Microsecond => format!( + "{sign}PT{second}.{subsecond:06}s", + second = value / 1_000_000, + subsecond = value % 1_000_000 + ), + TimeUnit::Nanosecond => format!( + "{sign}PT{second}.{subsecond:09}s", + second = value / 1_000_000_000, + subsecond = value % 1_000_000_000 + ), + } 
+} + +fn get_optional_digit_value(s: Option<&str>) -> Result { + match s { + Some(s) => Ok(s.parse()?), + None => Ok(0), + } +} + +/// Minimalistic monadic parsers for datetime objects +/// +/// Each parser has the following interface: +/// +/// `fn (string_to_parse, ..extra_args) -> Result<(rest, result), unmatched_string>` +/// +mod parsing { + pub const DIGIT: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; + + pub trait ParseResult { + type Output; + + fn matches(&self) -> bool; + fn into_result(self, output_type: &str) -> crate::internal::error::Result; + } + + impl<'a, 'e, R> ParseResult for Result<(&'a str, R), &'e str> { + type Output = R; + + fn matches(&self) -> bool { + match self { + Ok((rest, _)) => rest.is_empty(), + Err(_) => false, + } + } + + fn into_result(self, output_type: &str) -> crate::internal::error::Result { + match self { + Ok(("", output)) => Ok(output), + Ok((unmatched, _)) | Err(unmatched) => crate::internal::error::fail!( + "Could not parse the string as {output_type}, unmatched content: {unmatched:?}" + ), + } + } + } + + #[derive(Debug, Default, Clone, PartialEq, Eq)] + pub struct Date<'a> { + pub sign: Option, + pub year: &'a str, + pub month: &'a str, + pub day: &'a str, + } + + #[derive(Debug, Default, Clone, PartialEq, Eq)] + pub struct Time<'a> { + pub hour: &'a str, + pub minute: &'a str, + pub second: &'a str, + pub subsecond: Option<&'a str>, + } + + #[derive(Debug, Default, Clone, PartialEq, Eq)] + pub struct DateTime<'a> { + pub date: Date<'a>, + pub time: Time<'a>, + } + + #[derive(Debug, Default, Clone, PartialEq, Eq)] + pub struct DateTimeUtc<'a> { + pub date: Date<'a>, + pub time: Time<'a>, + pub timezone: &'a str, + } + + #[derive(Debug, Default, Clone, PartialEq, Eq)] + pub struct Span<'a> { + pub sign: Option, + pub year: Option<&'a str>, + pub month: Option<&'a str>, + pub day: Option<&'a str>, + pub week: Option<&'a str>, + pub hour: Option<&'a str>, + pub minute: Option<&'a str>, + pub second: 
Option<&'a str>, + pub subsecond: Option<&'a str>, + } + + pub fn match_utc_datetime(s: &str) -> Result<(&str, DateTimeUtc<'_>), &str> { + let (s, DateTime { date, time }) = match_naive_datetime_with_sep(s, &['T', ' '])?; + let (s, timezone) = match_utc_timezone(s)?; + Ok(( + s, + DateTimeUtc { + date, + time, + timezone, + }, + )) + } + + pub fn match_naive_datetime(s: &str) -> Result<(&str, DateTime<'_>), &str> { + match_naive_datetime_with_sep(s, &['T']) + } + + pub fn match_naive_date(s: &str) -> Result<(&str, Date<'_>), &str> { + let (s, sign) = match_optional_sign(s)?; + let (s, year) = match_one_or_more_digits(s)?; + let (s, _) = match_char(s, '-')?; + let (s, month) = match_one_or_two_digits(s)?; + let (s, _) = match_char(s, '-')?; + let (s, day) = match_one_or_two_digits(s)?; + Ok(( + s, + Date { + sign, + year, + month, + day, + }, + )) + } + + pub fn match_naive_time(s: &str) -> Result<(&str, Time<'_>), &str> { + let (s, hour) = match_one_or_two_digits(s)?; + let (s, _) = match_char(s, ':')?; + let (s, minute) = match_one_or_two_digits(s)?; + let (s, _) = match_char(s, ':')?; + let (s, second) = match_one_or_two_digits(s)?; + + let (s, subsecond) = if let Some(s) = s.strip_prefix('.') { + let (s, subsecond) = match_one_or_more_digits(s)?; + (s, Some(subsecond)) + } else { + (s, None) + }; + + Ok(( + s, + Time { + hour, + minute, + second, + subsecond, + }, + )) + } + + pub fn match_span(s: &str) -> Result<(&str, Span<'_>), &str> { + let (s, sign) = match_optional_sign(s)?; + let (s, _) = match_char_case_insensitive(s, 'P')?; + let (s, year) = match_optional_span_value(s, 'Y')?; + let (s, month) = match_optional_span_value(s, 'M')?; + let (s, week) = match_optional_span_value(s, 'W')?; + let (s, day) = match_optional_span_value(s, 'D')?; + + let (s, hour, minute, second, subsecond) = if let Some(s) = s.strip_prefix(['t', 'T']) { + let (s, hour) = match_optional_span_value(s, 'H')?; + let (s, minute) = match_optional_span_value(s, 'M')?; + let (s, second, 
subsecond) = match_optional_span_seconds(s)?; + (s, hour, minute, second, subsecond) + } else { + (s, None, None, None, None) + }; + + Ok(( + s, + Span { + sign, + year, + month, + week, + day, + hour, + minute, + second, + subsecond, + }, + )) + } + + pub fn match_optional_span_seconds( + s: &str, + ) -> Result<(&str, Option<&str>, Option<&str>), &str> { + let Ok((rest, second)) = match_one_or_more_digits(s) else { + return Ok((s, None, None)); + }; + let second = Some(second); + + let (rest, subsecond) = if let Some(rest) = rest.strip_prefix('.') { + // Q: is a subsecond part really required after a '.'? + let (rest, subsecond) = match_one_or_more_digits(rest)?; + (rest, Some(subsecond)) + } else { + (rest, None) + }; + + let Ok((rest, _)) = match_char_case_insensitive(rest, 'S') else { + return Ok((s, None, None)); + }; + + Ok((rest, second, subsecond)) + } + + pub fn match_naive_datetime_with_sep<'a>( + s: &'a str, + sep: &'_ [char], + ) -> Result<(&'a str, DateTime<'a>), &'a str> { + let (s, date) = match_naive_date(s)?; + let s = s.strip_prefix(sep).ok_or(s)?; + let (s, time) = match_naive_time(s)?; + Ok((s, DateTime { date, time })) + } + + /// Match known UTC time zone designators + /// + /// Note: this function is more permissive than some libraries (e.g., jiff) + pub fn match_utc_timezone(s: &str) -> Result<(&str, &str), &str> { + for prefix in ["Z", "+0000", "+00:00"] { + if let Some(rest) = s.strip_prefix(prefix) { + return Ok((rest, get_prefix(s, rest))); + } + } + Err(s) + } + + fn get_prefix<'a>(s: &'a str, rest: &str) -> &'a str { + debug_assert!(s.ends_with(rest), "Invalid call to get prefix"); + let len_prefix = s.len() - rest.len(); + &s[..len_prefix] + } + + /// Match a value in a span + pub fn match_optional_span_value(s: &str, unit: char) -> Result<(&str, Option<&str>), &str> { + let Ok((rest, value)) = match_one_or_more_digits(s) else { + return Ok((s, None)); + }; + let Ok((rest, _)) = match_char_case_insensitive(rest, unit) else { + return 
Ok((s, None)); + }; + Ok((rest, Some(value))) + } + + pub fn match_optional_sign(s: &str) -> Result<(&str, Option), &str> { + if let Some(rest) = s.strip_prefix('+') { + Ok((rest, Some('+'))) + } else if let Some(rest) = s.strip_prefix('-') { + Ok((rest, Some('-'))) + } else { + Ok((s, None)) + } + } + + pub fn match_one_or_more_digits(s: &str) -> Result<(&str, &str), &str> { + let mut rest = s.strip_prefix(DIGIT).ok_or(s)?; + while let Some(new_rest) = rest.strip_prefix(DIGIT) { + rest = new_rest; + } + Ok((rest, get_prefix(s, rest))) + } + + pub fn match_one_or_two_digits(s: &str) -> Result<(&str, &str), &str> { + let rest = s.strip_prefix(DIGIT).ok_or(s)?; + let rest = rest.strip_prefix(DIGIT).unwrap_or(rest); + Ok((rest, get_prefix(s, rest))) + } + + pub fn match_char(s: &str, c: char) -> Result<(&str, char), &str> { + if let Some(rest) = s.strip_prefix(c) { + Ok((rest, c)) + } else { + Err(s) + } + } + + /// Match a character case insensitive + /// + /// Note: `c` must be an ASCII character and must be uppercase + pub fn match_char_case_insensitive(s: &str, c: char) -> Result<(&str, char), &str> { + debug_assert!(c.is_ascii()); + debug_assert!(c.is_ascii_uppercase()); + + let c_lowercase = c.to_ascii_lowercase(); + + if let Some(rest) = s.strip_prefix(c) { + Ok((rest, c)) + } else if let Some(rest) = s.strip_prefix(c_lowercase) { + Ok((rest, c_lowercase)) + } else { + Err(s) + } + } +} + +#[test] +fn test_match_naive_datetime() { + // chrono examples + assert_eq!( + parsing::match_naive_datetime("2015-09-18T23:56:04"), + Ok(( + "", + parsing::DateTime { + date: parsing::Date { + sign: None, + year: "2015", + month: "09", + day: "18" + }, + time: parsing::Time { + hour: "23", + minute: "56", + second: "04", + subsecond: None + } + } + )) + ); + assert_eq!( + parsing::match_naive_datetime("+12345-6-7T7:59:60.5"), + Ok(( + "", + parsing::DateTime { + date: parsing::Date { + sign: Some('+'), + year: "12345", + month: "6", + day: "7" + }, + time: parsing::Time { + 
hour: "7", + minute: "59", + second: "60", + subsecond: Some("5") + }, + } + )) + ); +} + +#[test] +fn test_match_utc_datetime() { + // examples from the chrono docs + assert_eq!( + parsing::match_utc_datetime("2012-12-12T12:12:12Z"), + Ok(( + "", + parsing::DateTimeUtc { + date: parsing::Date { + sign: None, + year: "2012", + month: "12", + day: "12" + }, + time: parsing::Time { + hour: "12", + minute: "12", + second: "12", + subsecond: None + }, + timezone: "Z", + } + )) + ); + assert_eq!( + parsing::match_utc_datetime("2012-12-12 12:12:12Z"), + Ok(( + "", + parsing::DateTimeUtc { + date: parsing::Date { + sign: None, + year: "2012", + month: "12", + day: "12" + }, + time: parsing::Time { + hour: "12", + minute: "12", + second: "12", + subsecond: None + }, + timezone: "Z", + } + )) + ); + assert_eq!( + parsing::match_utc_datetime("2012-12-12 12:12:12+0000"), + Ok(( + "", + parsing::DateTimeUtc { + date: parsing::Date { + sign: None, + year: "2012", + month: "12", + day: "12" + }, + time: parsing::Time { + hour: "12", + minute: "12", + second: "12", + subsecond: None + }, + timezone: "+0000", + } + )) + ); + assert_eq!( + parsing::match_utc_datetime("2012-12-12 12:12:12+00:00"), + Ok(( + "", + parsing::DateTimeUtc { + date: parsing::Date { + sign: None, + year: "2012", + month: "12", + day: "12" + }, + time: parsing::Time { + hour: "12", + minute: "12", + second: "12", + subsecond: None + }, + timezone: "+00:00", + } + )) + ); +} + +#[test] +fn test_match_naive_date() { + assert_eq!( + parsing::match_naive_date("+12345-6-7"), + Ok(( + "", + parsing::Date { + sign: Some('+'), + year: "12345", + month: "6", + day: "7" + } + )) + ); + assert_eq!( + parsing::match_naive_date("2015-09-18"), + Ok(( + "", + parsing::Date { + sign: None, + year: "2015", + month: "09", + day: "18" + } + )) + ); + + // NOTE: the content is not verified + assert_eq!( + parsing::match_naive_date("-20-21-22"), + Ok(( + "", + parsing::Date { + sign: Some('-'), + year: "20", + month: "21", + 
day: "22" + } + )) + ); + + assert_eq!(parsing::match_naive_date("foo"), Err("foo")); + + assert_eq!(parsing::match_naive_date("2015-123-18"), Err("3-18")); + + // trailing digits are returned as rest + assert_eq!( + parsing::match_naive_date("2024-12-091234"), + Ok(( + "1234", + parsing::Date { + sign: None, + year: "2024", + month: "12", + day: "09" + } + )) + ); +} + +#[test] +fn test_match_naive_time() { + assert_eq!( + parsing::match_naive_time("23:00:12"), + Ok(( + "", + parsing::Time { + hour: "23", + minute: "00", + second: "12", + subsecond: None + } + )) + ); + assert_eq!( + parsing::match_naive_time("23:00:12.999"), + Ok(( + "", + parsing::Time { + hour: "23", + minute: "00", + second: "12", + subsecond: Some("999") + } + )) + ); +} + +#[test] +fn match_span() { + // jiff examples + assert_eq!( + parsing::match_span("P40D"), + Ok(( + "", + parsing::Span { + day: Some("40"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P1y1d"), + Ok(( + "", + parsing::Span { + year: Some("1"), + day: Some("1"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P1m"), + Ok(( + "", + parsing::Span { + month: Some("1"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P1w"), + Ok(( + "", + parsing::Span { + week: Some("1"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P1w4d"), + Ok(( + "", + parsing::Span { + week: Some("1"), + day: Some("4"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P0d"), + Ok(( + "", + parsing::Span { + day: Some("0"), + ..Default::default() + } + )) + ); + + assert_eq!( + parsing::match_span("P3dT4h59m"), + Ok(( + "", + parsing::Span { + day: Some("3"), + hour: Some("4"), + minute: Some("59"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("PT2H30M"), + Ok(( + "", + parsing::Span { + hour: Some("2"), + minute: Some("30"), + ..Default::default() + } + )) + ); + assert_eq!( + 
parsing::match_span("PT1m"), + Ok(( + "", + parsing::Span { + minute: Some("1"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("PT0s"), + Ok(( + "", + parsing::Span { + second: Some("0"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("PT0.0021s"), + Ok(( + "", + parsing::Span { + second: Some("0"), + subsecond: Some("0021"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("P1y1m1dT1h1m1.1s"), + Ok(( + "", + parsing::Span { + year: Some("1"), + month: Some("1"), + day: Some("1"), + hour: Some("1"), + minute: Some("1"), + second: Some("1"), + subsecond: Some("1"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("+P3dT4h59m"), + Ok(( + "", + parsing::Span { + sign: Some('+'), + day: Some("3"), + hour: Some("4"), + minute: Some("59"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("-P1w4d"), + Ok(( + "", + parsing::Span { + sign: Some('-'), + week: Some("1"), + day: Some("4"), + ..Default::default() + } + )) + ); + assert_eq!( + parsing::match_span("PT0.020s"), + Ok(( + "", + parsing::Span { + second: Some("0"), + subsecond: Some("020"), + ..Default::default() + } + )) + ) +} + +#[test] +fn match_optional_sign() { + assert_eq!(parsing::match_optional_sign("foo"), Ok(("foo", None))); + assert_eq!(parsing::match_optional_sign("?foo"), Ok(("?foo", None))); + assert_eq!(parsing::match_optional_sign("+foo"), Ok(("foo", Some('+')))); + assert_eq!(parsing::match_optional_sign("-foo"), Ok(("foo", Some('-')))); +} + +#[test] +fn match_one_or_more_digits() { + assert_eq!(parsing::match_one_or_more_digits("foo"), Err("foo")); + assert_eq!(parsing::match_one_or_more_digits(" 1foo"), Err(" 1foo")); + assert_eq!(parsing::match_one_or_more_digits("1foo"), Ok(("foo", "1"))); + assert_eq!( + parsing::match_one_or_more_digits("12foo"), + Ok(("foo", "12")) + ); + assert_eq!( + parsing::match_one_or_more_digits("123foo"), + Ok(("foo", "123")) + ); + assert_eq!( 
+ parsing::match_one_or_more_digits("1234foo"), + Ok(("foo", "1234")) + ); +} + +#[test] +fn match_one_or_two_digits() { + assert_eq!(parsing::match_one_or_two_digits("foo"), Err("foo")); + assert_eq!(parsing::match_one_or_two_digits(" 1foo"), Err(" 1foo")); + assert_eq!(parsing::match_one_or_two_digits("1foo"), Ok(("foo", "1"))); + assert_eq!(parsing::match_one_or_two_digits("12foo"), Ok(("foo", "12"))); + assert_eq!( + parsing::match_one_or_two_digits("123foo"), + Ok(("3foo", "12")) + ); + assert_eq!( + parsing::match_one_or_two_digits("1234foo"), + Ok(("34foo", "12")) + ); +} + +#[test] +fn test_parse_and_format_duration() { + fn parse_as_duration(s: &str, unit: TimeUnit) -> i64 { + parse_span(s).unwrap().to_arrow_duration(unit).unwrap() + } + + assert_eq!(format_arrow_duration_as_span(20, TimeUnit::Second), "PT20s"); + assert_eq!( + format_arrow_duration_as_span(20, TimeUnit::Millisecond), + "PT0.020s" + ); + assert_eq!( + format_arrow_duration_as_span(20, TimeUnit::Microsecond), + "PT0.000020s" + ); + assert_eq!( + format_arrow_duration_as_span(20, TimeUnit::Nanosecond), + "PT0.000000020s" + ); + + assert_eq!(parse_as_duration("PT20s", TimeUnit::Second), 20); + assert_eq!(parse_as_duration("PT0.020s", TimeUnit::Millisecond), 20); + assert_eq!(parse_as_duration("PT0.000020s", TimeUnit::Microsecond), 20); + assert_eq!( + parse_as_duration("PT0.000000020s", TimeUnit::Nanosecond), + 20 + ); + + assert_eq!( + format_arrow_duration_as_span(-13, TimeUnit::Second), + "-PT13s" + ); + assert_eq!( + format_arrow_duration_as_span(-13, TimeUnit::Millisecond), + "-PT0.013s" + ); + assert_eq!( + format_arrow_duration_as_span(-13, TimeUnit::Microsecond), + "-PT0.000013s" + ); + assert_eq!( + format_arrow_duration_as_span(-13, TimeUnit::Nanosecond), + "-PT0.000000013s" + ); + + assert_eq!(parse_as_duration("-PT13s", TimeUnit::Second), -13); + assert_eq!(parse_as_duration("-PT0.013s", TimeUnit::Millisecond), -13); + assert_eq!( + parse_as_duration("-PT0.000013s", 
TimeUnit::Microsecond), + -13 + ); + assert_eq!( + parse_as_duration("-PT0.000000013s", TimeUnit::Nanosecond), + -13 + ); + + assert_eq!( + format_arrow_duration_as_span(1234, TimeUnit::Second), + "PT1234s" + ); + assert_eq!( + format_arrow_duration_as_span(1234, TimeUnit::Millisecond), + "PT1.234s" + ); + assert_eq!( + format_arrow_duration_as_span(1234, TimeUnit::Microsecond), + "PT0.001234s" + ); + assert_eq!( + format_arrow_duration_as_span(1234, TimeUnit::Nanosecond), + "PT0.000001234s" + ); + + assert_eq!(parse_as_duration("PT1234s", TimeUnit::Second), 1234); + assert_eq!(parse_as_duration("PT1.234s", TimeUnit::Millisecond), 1234); + assert_eq!( + parse_as_duration("PT0.001234s", TimeUnit::Microsecond), + 1234 + ); + assert_eq!( + parse_as_duration("PT0.000001234s", TimeUnit::Nanosecond), + 1234 + ); + + assert_eq!( + format_arrow_duration_as_span(-2010, TimeUnit::Second), + "-PT2010s" + ); + assert_eq!( + format_arrow_duration_as_span(-2010, TimeUnit::Millisecond), + "-PT2.010s" + ); + assert_eq!( + format_arrow_duration_as_span(-2010, TimeUnit::Microsecond), + "-PT0.002010s" + ); + assert_eq!( + format_arrow_duration_as_span(-2010, TimeUnit::Nanosecond), + "-PT0.000002010s" + ); + + assert_eq!(parse_as_duration("-PT2010s", TimeUnit::Second), -2010); + assert_eq!(parse_as_duration("-PT2.010s", TimeUnit::Millisecond), -2010); + assert_eq!( + parse_as_duration("-PT0.002010s", TimeUnit::Microsecond), + -2010 + ); + assert_eq!( + parse_as_duration("-PT0.000002010s", TimeUnit::Nanosecond), + -2010 + ); + + assert_eq!( + format_arrow_duration_as_span(123456789, TimeUnit::Second), + "PT123456789s" + ); + assert_eq!( + format_arrow_duration_as_span(123456789, TimeUnit::Millisecond), + "PT123456.789s" + ); + assert_eq!( + format_arrow_duration_as_span(123456789, TimeUnit::Microsecond), + "PT123.456789s" + ); + assert_eq!( + format_arrow_duration_as_span(123456789, TimeUnit::Nanosecond), + "PT0.123456789s" + ); + + assert_eq!( + parse_as_duration("PT123456789s", 
TimeUnit::Second), + 123456789 + ); + assert_eq!( + parse_as_duration("PT123456.789s", TimeUnit::Millisecond), + 123456789 + ); + assert_eq!( + parse_as_duration("PT123.456789s", TimeUnit::Microsecond), + 123456789 + ); + assert_eq!( + parse_as_duration("PT0.123456789s", TimeUnit::Nanosecond), + 123456789 + ); +} diff --git a/serde_arrow/src/internal/deserialization/array_deserializer.rs b/serde_arrow/src/internal/deserialization/array_deserializer.rs index c423d105..4707c4fa 100644 --- a/serde_arrow/src/internal/deserialization/array_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/array_deserializer.rs @@ -12,7 +12,7 @@ use super::{ binary_deserializer::BinaryDeserializer, bool_deserializer::BoolDeserializer, date32_deserializer::Date32Deserializer, date64_deserializer::Date64Deserializer, decimal_deserializer::DecimalDeserializer, dictionary_deserializer::DictionaryDeserializer, - enum_deserializer::EnumDeserializer, + duration_deserializer::DurationDeserializer, enum_deserializer::EnumDeserializer, fixed_size_binary_deserializer::FixedSizeBinaryDeserializer, fixed_size_list_deserializer::FixedSizeListDeserializer, float_deserializer::FloatDeserializer, integer_deserializer::IntegerDeserializer, list_deserializer::ListDeserializer, @@ -36,6 +36,7 @@ pub enum ArrayDeserializer<'a> { F32(FloatDeserializer<'a, f32>), F64(FloatDeserializer<'a, f64>), Decimal128(DecimalDeserializer<'a>), + Duration(DurationDeserializer<'a>), Date32(Date32Deserializer<'a>), Date64(Date64Deserializer<'a>), Time32(TimeDeserializer<'a, i32>), @@ -122,8 +123,9 @@ impl<'a> ArrayDeserializer<'a> { is_utc_timestamp(view.timezone.as_deref())?, ))), }, - V::Duration(view) => Ok(D::I64(IntegerDeserializer::new( + V::Duration(view) => Ok(D::Duration(DurationDeserializer::new( path, + view.unit, PrimitiveArrayView { values: view.values, validity: view.validity, @@ -342,6 +344,7 @@ macro_rules! 
dispatch { $wrapper::F32($name) => $expr, $wrapper::F64($name) => $expr, $wrapper::Decimal128($name) => $expr, + $wrapper::Duration($name) => $expr, $wrapper::Date32($name) => $expr, $wrapper::Date64($name) => $expr, $wrapper::Time32($name) => $expr, diff --git a/serde_arrow/src/internal/deserialization/date32_deserializer.rs b/serde_arrow/src/internal/deserialization/date32_deserializer.rs index f37f0b00..bf35e471 100644 --- a/serde_arrow/src/internal/deserialization/date32_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/date32_deserializer.rs @@ -1,4 +1,4 @@ -use chrono::{Duration, NaiveDate, NaiveDateTime}; +use chrono::{Datelike, Duration, NaiveDate, NaiveDateTime}; use serde::de::Visitor; use crate::internal::{ @@ -27,7 +27,23 @@ impl<'a> Date32Deserializer<'a> { #[allow(deprecated)] let delta = Duration::days(ts as i64); let date = UNIX_EPOCH + delta; - Ok(date.to_string()) + + // special handling of negative dates: + // + // - jiff expects 6 digits years in this case + // - chrono allows an arbitrary number of digits, when prefixed with a sign + // + // https://github.com/chronotope/chrono/blob/05a6ce68cf18a01274cef211b080a7170c7c1a1f/src/format/parse.rs#L368 + if date.year() < 0 { + Ok(format!( + "-{positive_year:06}-{month:02}-{day:02}", + positive_year = -date.year(), + month = date.month(), + day = date.day(), + )) + } else { + Ok(date.to_string()) + } } } @@ -78,4 +94,16 @@ impl<'de> SimpleDeserializer<'de> for Date32Deserializer<'de> { }) .ctx(self) } + + fn deserialize_bytes>(&mut self, visitor: V) -> Result { + try_(|| self.deserialize_byte_buf(visitor)).ctx(self) + } + + fn deserialize_byte_buf>(&mut self, visitor: V) -> Result { + try_(|| { + let ts = self.array.next_required()?; + visitor.visit_byte_buf(self.get_string_repr(ts)?.into_bytes()) + }) + .ctx(self) + } } diff --git a/serde_arrow/src/internal/deserialization/date64_deserializer.rs b/serde_arrow/src/internal/deserialization/date64_deserializer.rs index b6bc2601..9db12467 
100644 --- a/serde_arrow/src/internal/deserialization/date64_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/date64_deserializer.rs @@ -1,4 +1,4 @@ -use chrono::DateTime; +use chrono::{DateTime, Datelike, Utc}; use serde::de::Visitor; use crate::internal::{ @@ -43,11 +43,32 @@ impl<'a> Date64Deserializer<'a> { }; if self.is_utc { + Ok(self.format_with_suffix(date_time, "Z")) + } else { + Ok(self.format_with_suffix(date_time, "")) + } + } + + pub fn format_with_suffix(&self, date_time: DateTime, suffix: &str) -> String { + let date_time = date_time.naive_utc(); + // special handling of negative dates: + // + // - jiff expects 6 digits years in this case + // - chrono allows an arbitrary number of digits, when prefixed with a sign + // + // https://github.com/chronotope/chrono/blob/05a6ce68cf18a01274cef211b080a7170c7c1a1f/src/format/parse.rs#L368 + if date_time.year() < 0 { // NOTE: chrono documents that Debug, not Display, can be parsed - Ok(format!("{:?}", date_time)) + format!( + "-{positive_year:06}-{month:02}-{day:02}T{time:?}{suffix}", + positive_year = -date_time.year(), + month = date_time.month(), + day = date_time.day(), + time = date_time.time(), + ) } else { // NOTE: chrono documents that Debug, not Display, can be parsed - Ok(format!("{:?}", date_time.naive_utc())) + format!("{:?}{suffix}", date_time) } } } @@ -99,4 +120,16 @@ impl<'de> SimpleDeserializer<'de> for Date64Deserializer<'de> { }) .ctx(self) } + + fn deserialize_bytes>(&mut self, visitor: V) -> Result { + try_(|| self.deserialize_byte_buf(visitor).ctx(self)) + } + + fn deserialize_byte_buf>(&mut self, visitor: V) -> Result { + try_(|| { + let ts = self.array.next_required()?; + visitor.visit_byte_buf(self.get_string_repr(ts)?.into_bytes()) + }) + .ctx(self) + } } diff --git a/serde_arrow/src/internal/deserialization/duration_deserializer.rs b/serde_arrow/src/internal/deserialization/duration_deserializer.rs new file mode 100644 index 00000000..bd91c2bb --- /dev/null +++ 
b/serde_arrow/src/internal/deserialization/duration_deserializer.rs @@ -0,0 +1,84 @@ +use serde::de::Visitor; + +use crate::internal::{ + arrow::{PrimitiveArrayView, TimeUnit}, + chrono, + error::{set_default, try_, Context, ContextSupport, Result}, + utils::Mut, +}; + +use super::{simple_deserializer::SimpleDeserializer, utils::ArrayBufferIterator}; + +pub struct DurationDeserializer<'a> { + path: String, + unit: TimeUnit, + array: ArrayBufferIterator<'a, i64>, +} + +impl<'a> DurationDeserializer<'a> { + pub fn new(path: String, unit: TimeUnit, view: PrimitiveArrayView<'a, i64>) -> Self { + Self { + path, + unit, + array: ArrayBufferIterator::new(view.values, view.validity), + } + } + + pub fn next_string_value_required(&mut self) -> Result { + let value = self.array.next_required()?; + Ok(chrono::format_arrow_duration_as_span(value, self.unit)) + } +} + +impl<'de> Context for DurationDeserializer<'de> { + fn annotate(&self, annotations: &mut std::collections::BTreeMap) { + set_default(annotations, "field", &self.path); + set_default(annotations, "data_type", "Duration(..)"); + } +} + +impl<'de> SimpleDeserializer<'de> for DurationDeserializer<'de> { + fn deserialize_any>(&mut self, visitor: V) -> Result { + try_(|| { + if self.array.peek_next()? { + self.deserialize_i64(visitor) + } else { + self.array.consume_next(); + visitor.visit_none() + } + }) + .ctx(self) + } + + fn deserialize_option>(&mut self, visitor: V) -> Result { + try_(|| { + if self.array.peek_next()? 
{ + visitor.visit_some(Mut(&mut *self)) + } else { + self.array.consume_next(); + visitor.visit_none() + } + }) + .ctx(self) + } + + fn deserialize_i64>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_i64(self.array.next_required()?)).ctx(self) + } + + fn deserialize_str>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_str(self.next_string_value_required()?.as_str())).ctx(self) + } + + fn deserialize_string>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_string(self.next_string_value_required()?)).ctx(self) + } + + fn deserialize_bytes>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_bytes(self.next_string_value_required()?.as_bytes())).ctx(self) + } + + fn deserialize_byte_buf>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_byte_buf(self.next_string_value_required()?.into_bytes())).ctx(self) + } +} diff --git a/serde_arrow/src/internal/deserialization/mod.rs b/serde_arrow/src/internal/deserialization/mod.rs index 8da9f023..42a3a80f 100644 --- a/serde_arrow/src/internal/deserialization/mod.rs +++ b/serde_arrow/src/internal/deserialization/mod.rs @@ -5,6 +5,7 @@ pub mod date32_deserializer; pub mod date64_deserializer; pub mod decimal_deserializer; pub mod dictionary_deserializer; +pub mod duration_deserializer; pub mod enum_deserializer; pub mod enums_as_string_impl; pub mod fixed_size_binary_deserializer; diff --git a/serde_arrow/src/internal/deserialization/string_deserializer.rs b/serde_arrow/src/internal/deserialization/string_deserializer.rs index 067e79a1..183f5832 100644 --- a/serde_arrow/src/internal/deserialization/string_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/string_deserializer.rs @@ -1,6 +1,6 @@ use crate::internal::{ arrow::BytesArrayView, - error::{fail, set_default, Context, ContextSupport, Result}, + error::{fail, set_default, try_, Context, ContextSupport, Result}, utils::{Mut, NamedType, Offset}, }; @@ -85,72 +85,55 @@ impl<'a, O: NamedType + Offset> Context for 
StringDeserializer<'a, O> { impl<'a, O: NamedType + Offset> SimpleDeserializer<'a> for StringDeserializer<'a, O> { fn deserialize_any>(&mut self, visitor: V) -> Result { - self.deserialize_any_impl(visitor).ctx(self) + try_(|| { + if self.peek_next()? { + self.deserialize_str(visitor) + } else { + self.consume_next(); + visitor.visit_none() + } + }) + .ctx(self) } fn deserialize_option>(&mut self, visitor: V) -> Result { - self.deserialize_option_impl(visitor).ctx(self) + try_(|| { + if self.peek_next()? { + visitor.visit_some(Mut(self)) + } else { + self.consume_next(); + visitor.visit_none() + } + }) + .ctx(self) } fn deserialize_str>(&mut self, visitor: V) -> Result { - self.deserialize_str_impl(visitor).ctx(self) + try_(|| visitor.visit_borrowed_str(self.next_required()?)).ctx(self) } fn deserialize_string>(&mut self, visitor: V) -> Result { - self.deserialize_string_impl(visitor).ctx(self) - } - - fn deserialize_enum>( - &mut self, - name: &'static str, - variants: &'static [&'static str], - visitor: V, - ) -> Result { - self.deserialize_enum_impl(name, variants, visitor) - .ctx(self) - } -} - -impl<'a, O: NamedType + Offset> StringDeserializer<'a, O> { - fn deserialize_any_impl>(&mut self, visitor: V) -> Result { - if self.peek_next()? { - self.deserialize_str(visitor) - } else { - self.consume_next(); - visitor.visit_none() - } + try_(|| visitor.visit_string(self.next_required()?.to_owned())).ctx(self) } - fn deserialize_option_impl>( - &mut self, - visitor: V, - ) -> Result { - if self.peek_next()? { - visitor.visit_some(Mut(self)) - } else { - self.consume_next(); - visitor.visit_none() - } + fn deserialize_bytes>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_bytes(self.next_required()?.as_bytes())).ctx(self) } - fn deserialize_str_impl>(&mut self, visitor: V) -> Result { - visitor.visit_borrowed_str(self.next_required()?) 
+ fn deserialize_byte_buf>(&mut self, visitor: V) -> Result { + try_(|| visitor.visit_byte_buf(self.next_required()?.to_owned().into_bytes())).ctx(self) } - fn deserialize_string_impl>( - &mut self, - visitor: V, - ) -> Result { - visitor.visit_string(self.next_required()?.to_owned()) - } - - fn deserialize_enum_impl>( + fn deserialize_enum>( &mut self, - _: &'static str, - _: &'static [&'static str], + _name: &'static str, + _variants: &'static [&'static str], visitor: V, ) -> Result { - let variant = self.next_required()?; - visitor.visit_enum(EnumAccess(variant)) + try_(|| { + let variant = self.next_required()?; + visitor.visit_enum(EnumAccess(variant)) + }) + .ctx(self) } } diff --git a/serde_arrow/src/internal/deserialization/time_deserializer.rs b/serde_arrow/src/internal/deserialization/time_deserializer.rs index 92d15206..7ae77fb9 100644 --- a/serde_arrow/src/internal/deserialization/time_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/time_deserializer.rs @@ -106,4 +106,16 @@ impl<'de, T: NamedType + Integer> SimpleDeserializer<'de> for TimeDeserializer<' }) .ctx(self) } + + fn deserialize_bytes>(&mut self, visitor: V) -> Result { + try_(|| self.deserialize_byte_buf(visitor)).ctx(self) + } + + fn deserialize_byte_buf>(&mut self, visitor: V) -> Result { + try_(|| { + let ts = self.array.next_required()?.into_i64()?; + visitor.visit_byte_buf(self.get_string_repr(ts)?.into_bytes()) + }) + .ctx(self) + } } diff --git a/serde_arrow/src/internal/mod.rs b/serde_arrow/src/internal/mod.rs index 1c7188d6..a53053bc 100644 --- a/serde_arrow/src/internal/mod.rs +++ b/serde_arrow/src/internal/mod.rs @@ -1,5 +1,6 @@ pub mod array_builder; pub mod arrow; +pub mod chrono; pub mod deserialization; pub mod deserializer; pub mod error; diff --git a/serde_arrow/src/internal/schema/from_samples/chrono.rs b/serde_arrow/src/internal/schema/from_samples/chrono.rs deleted file mode 100644 index 957dbdac..00000000 --- 
a/serde_arrow/src/internal/schema/from_samples/chrono.rs +++ /dev/null @@ -1,135 +0,0 @@ -pub fn matches_naive_datetime(s: &str) -> bool { - eval_parser(parsing::match_naive_datetime, s) -} - -pub fn matches_utc_datetime(s: &str) -> bool { - eval_parser(parsing::match_utc_datetime, s) -} - -pub fn matches_naive_date(s: &str) -> bool { - eval_parser(parsing::match_naive_date, s) -} - -pub fn matches_naive_time(s: &str) -> bool { - eval_parser(parsing::match_naive_time, s) -} - -fn eval_parser Result<&str, &str>>(parser: F, s: &str) -> bool { - parser(s.trim()).map(str::is_empty).unwrap_or_default() -} - -/// minimalistic monadic parser -/// -/// Returns the Err(unmatched_string) on error and Ok(rest) on success -mod parsing { - pub const DIGIT: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']; - - pub fn match_optional_sign(s: &str) -> Result<&str, &str> { - Ok(s.strip_prefix(['+', '-']).unwrap_or(s)) - } - - pub fn match_one_or_more_digits(s: &str) -> Result<&str, &str> { - let mut s = s.strip_prefix(DIGIT).ok_or(s)?; - while let Some(new_s) = s.strip_prefix(DIGIT) { - s = new_s; - } - Ok(s) - } - - pub fn match_one_or_two_digits(s: &str) -> Result<&str, &str> { - let s = s.strip_prefix(DIGIT).ok_or(s)?; - Ok(s.strip_prefix(DIGIT).unwrap_or(s)) - } - - pub fn match_char(s: &str, c: char) -> Result<&str, &str> { - s.strip_prefix(c).ok_or(s) - } - - pub fn match_naive_datetime_with_sep<'a>( - s: &'a str, - sep: &'_ [char], - ) -> Result<&'a str, &'a str> { - let s = match_naive_date(s)?; - let s = s.strip_prefix(sep).ok_or(s)?; - match_naive_time(s) - } - - pub fn match_naive_date(s: &str) -> Result<&str, &str> { - let s = match_optional_sign(s)?; - let s = match_one_or_more_digits(s)?; - let s = match_char(s, '-')?; - let s = match_one_or_two_digits(s)?; - let s = match_char(s, '-')?; - match_one_or_two_digits(s) - } - - pub fn match_naive_time(s: &str) -> Result<&str, &str> { - let s = match_one_or_two_digits(s)?; - let s = match_char(s, ':')?; - let s 
= match_one_or_two_digits(s)?; - let s = match_char(s, ':')?; - let s = match_one_or_two_digits(s)?; - - if let Some(s) = s.strip_prefix('.') { - match_one_or_more_digits(s) - } else { - Ok(s) - } - } - - pub fn match_naive_datetime(s: &str) -> Result<&str, &str> { - match_naive_datetime_with_sep(s, &['T']) - } - - pub fn match_utc_datetime(s: &str) -> Result<&str, &str> { - let s = match_naive_datetime_with_sep(s, &['T', ' '])?; - - if let Some(s) = s.strip_prefix('Z') { - Ok(s) - } else if let Some(s) = s.strip_prefix("+0000") { - Ok(s) - } else if let Some(s) = s.strip_prefix("+00:00") { - Ok(s) - } else { - Err(s) - } - } -} - -#[test] -fn test_match_naive_datetime() { - // chrono examples - assert_eq!(parsing::match_naive_datetime("2015-09-18T23:56:04"), Ok("")); - assert_eq!( - parsing::match_naive_datetime("+12345-6-7T7:59:60.5"), - Ok("") - ); -} - -#[test] -fn test_match_utc_datetime() { - // examples from the chrono docs - assert_eq!(parsing::match_utc_datetime("2012-12-12T12:12:12Z"), Ok("")); - assert_eq!(parsing::match_utc_datetime("2012-12-12 12:12:12Z"), Ok("")); - assert_eq!( - parsing::match_utc_datetime("2012-12-12 12:12:12+0000"), - Ok("") - ); - assert_eq!( - parsing::match_utc_datetime("2012-12-12 12:12:12+00:00"), - Ok("") - ); -} - -#[test] -fn test_match_naive_date() { - assert_eq!(parsing::match_naive_date("+12345-6-7"), Ok("")); - assert_eq!(parsing::match_naive_date("2015-09-18"), Ok("")); - assert_eq!(parsing::match_naive_date("-20-21-22"), Ok("")); -} - -#[test] -fn test_match_naive_time() { - assert_eq!(parsing::match_naive_time("23:00:12"), Ok("")); - assert_eq!(parsing::match_naive_time("23:00:12.999"), Ok("")); -} diff --git a/serde_arrow/src/internal/schema/from_samples/mod.rs b/serde_arrow/src/internal/schema/from_samples/mod.rs index bf36b934..9103204c 100644 --- a/serde_arrow/src/internal/schema/from_samples/mod.rs +++ b/serde_arrow/src/internal/schema/from_samples/mod.rs @@ -1,5 +1,4 @@ //! 
Support for `from_samples` -mod chrono; #[cfg(test)] mod test_error_messages; @@ -9,6 +8,7 @@ use serde::{ser::Impossible, Serialize}; use crate::internal::{ arrow::{DataType, TimeUnit}, + chrono, error::{fail, try_, Context, ContextSupport, Error, Result}, schema::{Strategy, TracingMode, TracingOptions}, }; @@ -335,6 +335,7 @@ impl<'a> serde::ser::Serializer for TracerSerializer<'a> { fn serialize_str(self, s: &str) -> Result { try_(|| { + #[allow(clippy::collapsible_else_if)] let (ty, st) = if !self.0.get_options().guess_dates { (DataType::LargeUtf8, None) } else { diff --git a/serde_arrow/src/internal/serialization/duration_builder.rs b/serde_arrow/src/internal/serialization/duration_builder.rs index 2c73e250..3762afc8 100644 --- a/serde_arrow/src/internal/serialization/duration_builder.rs +++ b/serde_arrow/src/internal/serialization/duration_builder.rs @@ -2,6 +2,7 @@ use std::collections::BTreeMap; use crate::internal::{ arrow::{Array, PrimitiveArray, TimeArray, TimeUnit}, + chrono, error::{set_default, try_, Context, ContextSupport, Result}, utils::array_ext::{new_primitive_array, ArrayExt, ScalarArrayExt}, }; @@ -92,4 +93,12 @@ impl SimpleSerializer for DurationBuilder { fn serialize_u64(&mut self, v: u64) -> Result<()> { try_(|| self.array.push_scalar_value(i64::try_from(v)?)).ctx(self) } + + fn serialize_str(&mut self, v: &str) -> Result<()> { + try_(|| { + let value = chrono::parse_span(v)?.to_arrow_duration(self.unit)?; + self.array.push_scalar_value(value) + }) + .ctx(self) + } } diff --git a/serde_arrow/src/internal/utils/mod.rs b/serde_arrow/src/internal/utils/mod.rs index 7fa35e5d..43536bf4 100644 --- a/serde_arrow/src/internal/utils/mod.rs +++ b/serde_arrow/src/internal/utils/mod.rs @@ -90,7 +90,6 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Item { } } -// TODO: implement for all types? 
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Items> { fn deserialize>( deserializer: D, diff --git a/serde_arrow/src/internal/utils/value.rs b/serde_arrow/src/internal/utils/value.rs index 68037dfe..7cdd9fb8 100644 --- a/serde_arrow/src/internal/utils/value.rs +++ b/serde_arrow/src/internal/utils/value.rs @@ -82,7 +82,7 @@ impl std::hash::Hash for HashF64 { } } -pub fn transmute(value: S) -> Result { +pub fn transmute(value: impl Serialize) -> Result { let value = value.serialize(ValueSerializer)?; T::deserialize(ValueDeserializer::new(&value)) } @@ -577,6 +577,7 @@ impl<'de, 'a> serde::de::Deserializer<'de> for ValueDeserializer<'a> { fn deserialize_byte_buf>(self, visitor: V) -> Result { match self.0 { Value::Bytes(v) => visitor.visit_byte_buf(v.to_owned()), + Value::String(v) => visitor.visit_byte_buf(v.as_bytes().to_owned()), v => fail!("Cannot deserialize bytes from non-bytes value {v:?}"), } } @@ -584,6 +585,7 @@ impl<'de, 'a> serde::de::Deserializer<'de> for ValueDeserializer<'a> { fn deserialize_bytes>(self, visitor: V) -> Result { match self.0 { Value::Bytes(v) => visitor.visit_bytes(v), + Value::String(v) => visitor.visit_bytes(v.as_bytes()), v => fail!("Cannot deserialize bytes from non-bytes value {v:?}"), } } diff --git a/serde_arrow/src/test/jiff.rs b/serde_arrow/src/test/jiff.rs new file mode 100644 index 00000000..fb3bcce0 --- /dev/null +++ b/serde_arrow/src/test/jiff.rs @@ -0,0 +1,136 @@ +use jiff::{ + civil::{date, time, Date, DateTime, Time}, + Span, Timestamp, Zoned, +}; + +use crate::internal::{testing::assert_error_contains, utils::value}; + +#[test] +fn string_repr_examples() { + // date + let obj = date(2023, 12, 31); + assert_eq!(value::transmute::(&obj).unwrap(), "2023-12-31"); + + let obj = date(-10, 10, 30); + assert_eq!(value::transmute::(&obj).unwrap(), "-000010-10-30"); + assert_eq!(value::transmute::("-000010-10-30").unwrap(), obj); + assert_error_contains( + &value::transmute::("-0010-10-30"), + "six digit integer", + ); + + 
// date time without time zone + let obj = date(2023, 12, 31).at(18, 30, 0, 0); + assert_eq!( + value::transmute::(&obj).unwrap(), + "2023-12-31T18:30:00" + ); + + // date time with timezone + let obj = date(2023, 12, 31).at(18, 30, 0, 0).intz("UTC").unwrap(); + assert_eq!( + value::transmute::(&obj).unwrap(), + "2023-12-31T18:30:00+00:00[UTC]" + ); + + // time without fractional part + let obj = time(16, 56, 42, 0); + assert_eq!(value::transmute::(&obj).unwrap(), "16:56:42"); + + // time with fractional part + let obj = time(16, 56, 42, 123_000_000); + assert_eq!(value::transmute::(&obj).unwrap(), "16:56:42.123"); + + // day span + let obj = Span::new().days(32); + assert_eq!(value::transmute::(&obj).unwrap(), "P32d"); + + // year month span + let obj = Span::new().years(4).months(7); + assert_eq!(value::transmute::(&obj).unwrap(), "P4y7m"); +} + +/// Test that the different reprs between chrono and jiff are compatible +#[test] +fn transmute_jiff_chrono() { + // date + let chrono = chrono::NaiveDate::from_ymd_opt(2023, 12, 31).unwrap(); + let jiff = date(2023, 12, 31); + + assert_eq!(value::transmute::(&chrono).unwrap(), jiff); + assert_eq!( + value::transmute::(&jiff).unwrap(), + chrono + ); + + // time without fractional part + let chrono = chrono::NaiveTime::from_hms_opt(19, 31, 22).unwrap(); + let jiff = time(19, 31, 22, 0); + + assert_eq!(value::transmute::