From e121347f3bbd1ba38c336d16a2beb8360cde7b4e Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 27 Oct 2023 16:49:07 +0200 Subject: [PATCH 01/27] Reformat code with current rustfmt --- serde_arrow/src/internal/common/checks.rs | 4 +++- serde_arrow/src/internal/deserialization/mod.rs | 13 ++++++++++--- serde_arrow/src/internal/error.rs | 2 +- serde_arrow/src/test_impls/macros.rs | 4 +++- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/serde_arrow/src/internal/common/checks.rs b/serde_arrow/src/internal/common/checks.rs index 798c2145..56beb63f 100644 --- a/serde_arrow/src/internal/common/checks.rs +++ b/serde_arrow/src/internal/common/checks.rs @@ -19,7 +19,9 @@ pub fn check_supported_list_layout<'a, O>( where O: std::ops::Sub + std::cmp::PartialEq + From + Copy, { - let Some(validity) = validity else { return Ok(()) }; + let Some(validity) = validity else { + return Ok(()); + }; if offsets.len() != validity.len() + 1 { fail!( diff --git a/serde_arrow/src/internal/deserialization/mod.rs b/serde_arrow/src/internal/deserialization/mod.rs index 562e9b7f..8989f95a 100644 --- a/serde_arrow/src/internal/deserialization/mod.rs +++ b/serde_arrow/src/internal/deserialization/mod.rs @@ -174,7 +174,10 @@ impl<'a> Compiler<'a> { if let Some(option_instr) = option_instr { let if_none = self.program.len(); - let Some(Bytecode::EmitOptionPrimitive(instr)) = self.program.get_mut(option_instr) else { unreachable!() }; + let Some(Bytecode::EmitOptionPrimitive(instr)) = self.program.get_mut(option_instr) + else { + unreachable!() + }; instr.if_none = if_none; instr.positions_to_increment = inner_child_positions; } @@ -320,7 +323,11 @@ impl<'a> Compiler<'a> { M::Map { offsets, entries, .. } => { - let M::Struct { fields: entries_fields,.. } = entries.as_ref() else { + let M::Struct { + fields: entries_fields, + .. 
+ } = entries.as_ref() + else { fail!("cannot extract entries arrays mapping") }; let Some(key_field) = entries_fields.get(0) else { @@ -572,7 +579,7 @@ impl<'a> Compiler<'a> { redirect_instrs.push(redirect_instr); } - let Some(Bytecode::UnionDispatch(instr)) = self.program.get_mut(dispatch_instr) else { + let Some(Bytecode::UnionDispatch(instr)) = self.program.get_mut(dispatch_instr) else { fail!("internal error: did not find union dispatch") }; instr.field_instr = field_instr; diff --git a/serde_arrow/src/internal/error.rs b/serde_arrow/src/internal/error.rs index 3d2e22ea..1563eef6 100644 --- a/serde_arrow/src/internal/error.rs +++ b/serde_arrow/src/internal/error.rs @@ -132,7 +132,7 @@ impl From for Error { impl From for Error { fn from(err: std::num::TryFromIntError) -> Error { - Self::custom_from(format!("arrow2::Error: {err}"), err) + Self::custom_from(format!("TryFromIntError: {err}"), err) } } diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index 7c33858d..302e04cb 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -333,7 +333,9 @@ macro_rules! 
test_error_impl { let actual = block(); let expected = $expected_error; - let Err(actual) = actual else { panic!("expected an error, but no error was raised"); }; + let Err(actual) = actual else { + panic!("expected an error, but no error was raised"); + }; let actual = actual.to_string(); From d43211c2333ea042c135c0d4cc2481786f89e116 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 27 Oct 2023 16:49:36 +0200 Subject: [PATCH 02/27] Add prototype for type based tracing / start to unify the tracing approaches --- serde_arrow/src/internal/mod.rs | 6 +- .../src/internal/serialization/compiler.rs | 3 +- serde_arrow/src/internal/tracing/mod.rs | 158 +++++ .../{tracing.rs => tracing/samples.rs} | 554 +++++++--------- serde_arrow/src/internal/tracing/tracer.rs | 590 +++++++++++++++++ serde_arrow/src/internal/tracing/types.rs | 619 ++++++++++++++++++ serde_arrow/src/test_impls/macros.rs | 4 +- 7 files changed, 1594 insertions(+), 340 deletions(-) create mode 100644 serde_arrow/src/internal/tracing/mod.rs rename serde_arrow/src/internal/{tracing.rs => tracing/samples.rs} (79%) create mode 100644 serde_arrow/src/internal/tracing/tracer.rs create mode 100644 serde_arrow/src/internal/tracing/types.rs diff --git a/serde_arrow/src/internal/mod.rs b/serde_arrow/src/internal/mod.rs index 3efb02db..5233c778 100644 --- a/serde_arrow/src/internal/mod.rs +++ b/serde_arrow/src/internal/mod.rs @@ -19,7 +19,7 @@ use self::{ schema::{GenericDataType, GenericField}, sink::{serialize_into_sink, EventSerializer, EventSink, StripOuterSequenceSink}, source::deserialize_from_source, - tracing::{Tracer, TracingOptions}, + tracing::{trace_type, SamplesTracer, TracingOptions}, }; pub static CONFIGURATION: RwLock = RwLock::new(Configuration { @@ -57,7 +57,7 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result( where T: Serialize + ?Sized, { - let tracer = Tracer::new(String::from("$"), options); + let tracer = SamplesTracer::new(String::from("$"), options); let 
tracer = StripOuterSequenceSink::new(tracer); let mut tracer = tracer; serialize_into_sink(&mut tracer, items)?; diff --git a/serde_arrow/src/internal/serialization/compiler.rs b/serde_arrow/src/internal/serialization/compiler.rs index a4779ada..5d4a0430 100644 --- a/serde_arrow/src/internal/serialization/compiler.rs +++ b/serde_arrow/src/internal/serialization/compiler.rs @@ -504,7 +504,8 @@ impl Program { if let Some(option_marker_pos) = option_marker_pos { let current_program_len = self.structure.program.len(); - let Bytecode::OptionMarker(instr) = &mut self.structure.program[option_marker_pos] else { + let Bytecode::OptionMarker(instr) = &mut self.structure.program[option_marker_pos] + else { fail!("Internal error during compilation"); }; instr.if_none = current_program_len; diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs new file mode 100644 index 00000000..e3da14aa --- /dev/null +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -0,0 +1,158 @@ +pub mod samples; +pub mod tracer; +pub mod types; + +use serde::{Deserialize, Serialize}; + +use crate::internal::{ + schema::{GenericField, Schema}, + Result, +}; + +pub use samples::SamplesTracer; +pub use types::trace_type; + +/// Configure how the schema is traced +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::schema::TracingOptions; +/// let tracing_options = TracingOptions::default() +/// .map_as_struct(true) +/// .string_dictionary_encoding(false); +/// ``` +/// +/// The defaults are: +/// +/// ```rust +/// # use serde_arrow::schema::TracingOptions; +/// # let defaults = +/// TracingOptions { +/// allow_null_fields: false, +/// map_as_struct: true, +/// string_dictionary_encoding: false, +/// coerce_numbers: false, +/// try_parse_dates: false, +/// } +/// # ; +/// # assert_eq!(defaults, TracingOptions::default()); +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct TracingOptions { + /// If `true`, accept null-only fields (e.g., fields with type 
`()` or fields + /// with only `None` entries). If `false`, schema tracing will fail in this + /// case. + pub allow_null_fields: bool, + + /// If `true` serialize maps as structs (the default). See + /// [`Strategy::MapAsStruct`] for details. + pub map_as_struct: bool, + + /// If `true` serialize strings dictionary encoded. The default is `false`. + /// + /// If `true`, strings are traced as `Dictionary(UInt64, LargeUtf8)`. If + /// `false`, strings are traced as `LargeUtf8`. + pub string_dictionary_encoding: bool, + + /// If `true`, coerce different numeric types. + /// + /// This option may be helpful when dealing with data formats that do not + /// encode the complete numeric type, e.g., JSON. The following rules are + /// used: + /// + /// - unsigned + other unsigned -> u64 + /// - signed + other signed -> i64 + /// - float + other float -> f64 + /// - unsigned + signed -> i64 + /// - unsigned + float -> f64 + /// - signed + float -> f64 + pub coerce_numbers: bool, + + /// If `true`, try to auto detect datetimes in string columns + /// + /// Currently the naive datetime (`YYYY-MM-DDThh:mm:ss`) and UTC datetimes + /// (`YYYY-MM-DDThh:mm:ssZ`) are understood. + /// + /// For string fields where all values are either missing or conform to one + /// of the format the data type is set as `Date64` with strategy + /// [`NaiveStrAsDate64`][Strategy::NaiveStrAsDate64] or + /// [`UtcStrAsDate64`][Strategy::UtcStrAsDate64]. 
+ pub try_parse_dates: bool, +} + +impl Default for TracingOptions { + fn default() -> Self { + Self { + allow_null_fields: false, + map_as_struct: true, + string_dictionary_encoding: false, + coerce_numbers: false, + try_parse_dates: false, + } + } +} + +impl TracingOptions { + pub fn new() -> Self { + Default::default() + } + + /// Configure `allow_null_fields` + pub fn allow_null_fields(mut self, value: bool) -> Self { + self.allow_null_fields = value; + self + } + + /// Configure `map_as_struct` + pub fn map_as_struct(mut self, value: bool) -> Self { + self.map_as_struct = value; + self + } + + /// Configure `string_dictionary_encoding` + pub fn string_dictionary_encoding(mut self, value: bool) -> Self { + self.string_dictionary_encoding = value; + self + } + + /// Configure `coerce_numbers` + pub fn coerce_numbers(mut self, value: bool) -> Self { + self.coerce_numbers = value; + self + } + + /// Configure `coerce_numbers` + pub fn guess_dates(mut self, value: bool) -> Self { + self.try_parse_dates = value; + self + } +} + +pub struct TracedSchema {} + +impl TracedSchema { + pub fn new() -> Self { + Self {} + } + + pub fn get_schema(&self) -> Result { + todo!() + } + + // TODO: add get_arrow2_fields + // TODO: add get_arrow_fields +} + +impl TracedSchema { + pub fn trace_samples(&mut self, samples: &T) -> Result<()> { + todo!() + } + + pub fn trace_type<'de, T: Deserialize<'de>>(&mut self) -> Result<()> { + todo!() + } +} + +#[test] +fn test_trace_type() {} diff --git a/serde_arrow/src/internal/tracing.rs b/serde_arrow/src/internal/tracing/samples.rs similarity index 79% rename from serde_arrow/src/internal/tracing.rs rename to serde_arrow/src/internal/tracing/samples.rs index 6a6dace6..0eecd67b 100644 --- a/serde_arrow/src/internal/tracing.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -7,147 +7,34 @@ use crate::internal::{ error::{fail, Result}, event::Event, sink::EventSink, + tracing::tracer::{PrimitiveTracer, UnknownTracer}, }; use super::{ - 
schema::{GenericDataType, GenericField, Strategy}, - sink::macros, + super::{ + schema::{GenericDataType, GenericField, Strategy}, + sink::macros, + }, + TracingOptions, }; -/// Configure how the schema is traced -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::schema::TracingOptions; -/// let tracing_options = TracingOptions::default() -/// .map_as_struct(true) -/// .string_dictionary_encoding(false); -/// ``` -/// -/// The defaults are: -/// -/// ```rust -/// # use serde_arrow::schema::TracingOptions; -/// # let defaults = -/// TracingOptions { -/// allow_null_fields: false, -/// map_as_struct: true, -/// string_dictionary_encoding: false, -/// coerce_numbers: false, -/// try_parse_dates: false, -/// } -/// # ; -/// # assert_eq!(defaults, TracingOptions::default()); -/// ``` -#[derive(Debug, Clone, PartialEq)] -pub struct TracingOptions { - /// If `true`, accept null-only fields (e.g., fields with type `()` or fields - /// with only `None` entries). If `false`, schema tracing will fail in this - /// case. - pub allow_null_fields: bool, - - /// If `true` serialize maps as structs (the default). See - /// [`Strategy::MapAsStruct`] for details. - pub map_as_struct: bool, - - /// If `true` serialize strings dictionary encoded. The default is `false`. - /// - /// If `true`, strings are traced as `Dictionary(UInt64, LargeUtf8)`. If - /// `false`, strings are traced as `LargeUtf8`. - pub string_dictionary_encoding: bool, - - /// If `true`, coerce different numeric types. - /// - /// This option may be helpful when dealing with data formats that do not - /// encode the complete numeric type, e.g., JSON. 
The following rules are - /// used: - /// - /// - unsigned + other unsigned -> u64 - /// - signed + other signed -> i64 - /// - float + other float -> f64 - /// - unsigned + signed -> i64 - /// - unsigned + float -> f64 - /// - signed + float -> f64 - pub coerce_numbers: bool, - - /// If `true`, try to auto detect datetimes in string columns - /// - /// Currently the naive datetime (`YYYY-MM-DDThh:mm:ss`) and UTC datetimes - /// (`YYYY-MM-DDThh:mm:ssZ`) are understood. - /// - /// For string fields where all values are either missing or conform to one - /// of the format the data type is set as `Date64` with strategy - /// [`NaiveStrAsDate64`][Strategy::NaiveStrAsDate64] or - /// [`UtcStrAsDate64`][Strategy::UtcStrAsDate64]. - pub try_parse_dates: bool, -} - -impl Default for TracingOptions { - fn default() -> Self { - Self { - allow_null_fields: false, - map_as_struct: true, - string_dictionary_encoding: false, - coerce_numbers: false, - try_parse_dates: false, - } - } -} - -impl TracingOptions { - pub fn new() -> Self { - Default::default() - } - - /// Configure `allow_null_fields` - pub fn allow_null_fields(mut self, value: bool) -> Self { - self.allow_null_fields = value; - self - } - - /// Configure `map_as_struct` - pub fn map_as_struct(mut self, value: bool) -> Self { - self.map_as_struct = value; - self - } - - /// Configure `string_dictionary_encoding` - pub fn string_dictionary_encoding(mut self, value: bool) -> Self { - self.string_dictionary_encoding = value; - self - } - - /// Configure `coerce_numbers` - pub fn coerce_numbers(mut self, value: bool) -> Self { - self.coerce_numbers = value; - self - } - - /// Configure `coerce_numbers` - pub fn guess_dates(mut self, value: bool) -> Self { - self.try_parse_dates = value; - self - } -} - -pub enum Tracer { +pub enum SamplesTracer { Unknown(UnknownTracer), - Struct(StructTracer), - List(ListTracer), Primitive(PrimitiveTracer), - Tuple(TupleTracer), - Union(UnionTracer), + List(ListTracer), Map(MapTracer), 
+ Struct(StructTracer), + Union(UnionTracer), + Tuple(TupleTracer), } -impl Tracer { +impl SamplesTracer { pub fn new(path: String, options: TracingOptions) -> Self { Self::Unknown(UnknownTracer::new(path, options)) } pub fn to_field(&self, name: &str) -> Result { - use Tracer::*; + use SamplesTracer::*; match self { Unknown(t) => t.to_field(name), List(t) => t.to_field(name), @@ -160,7 +47,7 @@ impl Tracer { } pub fn mark_nullable(&mut self) { - use Tracer::*; + use SamplesTracer::*; match self { Unknown(_) => {} List(t) => { @@ -183,9 +70,33 @@ impl Tracer { } } } + + fn reset(&mut self) -> Result<()> { + match self { + Self::Unknown(tracer) => tracer.reset(), + Self::List(tracer) => tracer.reset(), + Self::Struct(tracer) => tracer.reset(), + Self::Primitive(tracer) => tracer.reset(), + Self::Tuple(tracer) => tracer.reset(), + Self::Union(tracer) => tracer.reset(), + Self::Map(tracer) => tracer.reset(), + } + } + + pub fn finish(&mut self) -> Result<()> { + match self { + Self::Unknown(tracer) => tracer.finish(), + Self::List(tracer) => tracer.finish(), + Self::Struct(tracer) => tracer.finish(), + Self::Primitive(tracer) => tracer.finish(), + Self::Tuple(tracer) => tracer.finish(), + Self::Union(tracer) => tracer.finish(), + Self::Map(tracer) => tracer.finish(), + } + } } -impl EventSink for Tracer { +impl EventSink for SamplesTracer { macros::forward_specialized_to_generic!(); fn accept(&mut self, event: Event<'_>) -> Result<()> { @@ -206,9 +117,14 @@ impl EventSink for Tracer { | Event::F64(_) | Event::Str(_) | Event::OwnedStr(_) => { - let mut tracer = PrimitiveTracer::new(tracer.nullable, tracer.options.clone()); + let mut tracer = PrimitiveTracer::new( + tracer.path.clone(), + tracer.options.clone(), + GenericDataType::Null, + tracer.nullable, + ); tracer.accept(event)?; - *self = Tracer::Primitive(tracer) + *self = SamplesTracer::Primitive(tracer) } Event::StartSequence => { let mut tracer = ListTracer::new( @@ -217,7 +133,7 @@ impl EventSink for Tracer { 
tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::List(tracer); + *self = SamplesTracer::List(tracer); } Event::StartStruct => { let mut tracer = StructTracer::new( @@ -227,7 +143,7 @@ impl EventSink for Tracer { tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::Struct(tracer); + *self = SamplesTracer::Struct(tracer); } Event::StartTuple => { let mut tracer = TupleTracer::new( @@ -236,7 +152,7 @@ impl EventSink for Tracer { tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::Tuple(tracer); + *self = SamplesTracer::Tuple(tracer); } Event::StartMap => { if tracer.options.map_as_struct { @@ -247,7 +163,7 @@ impl EventSink for Tracer { tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::Struct(tracer); + *self = SamplesTracer::Struct(tracer); } else { let mut tracer = MapTracer::new( tracer.path.clone(), @@ -255,7 +171,7 @@ impl EventSink for Tracer { tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::Map(tracer); + *self = SamplesTracer::Map(tracer); } } Event::Variant(_, _) => { @@ -265,7 +181,7 @@ impl EventSink for Tracer { tracer.nullable, ); tracer.accept(event)?; - *self = Tracer::Union(tracer) + *self = SamplesTracer::Union(tracer) } ev if ev.is_end() => fail!( "Invalid end nesting events for unknown tracer ({path})", @@ -287,56 +203,7 @@ impl EventSink for Tracer { } fn finish(&mut self) -> Result<()> { - match self { - Self::Unknown(tracer) => tracer.finish(), - Self::List(tracer) => tracer.finish(), - Self::Struct(tracer) => tracer.finish(), - Self::Primitive(tracer) => tracer.finish(), - Self::Tuple(tracer) => tracer.finish(), - Self::Union(tracer) => tracer.finish(), - Self::Map(tracer) => tracer.finish(), - } - } -} - -pub struct UnknownTracer { - pub nullable: bool, - pub finished: bool, - pub path: String, - pub options: TracingOptions, -} - -impl UnknownTracer { - pub fn new(path: String, options: TracingOptions) -> Self { - Self { - nullable: false, - finished: false, - path, - options, - } - } - - pub 
fn to_field(&self, name: &str) -> Result { - if !self.finished { - fail!("Cannot build field {name} from unfinished tracer"); - } - if !self.options.allow_null_fields { - fail!(concat!( - "Encountered null only field. This error can be disabled by ", - "setting `allow_null_fields` to `true` in `TracingOptions`", - )); - } - - Ok(GenericField::new( - name, - GenericDataType::Null, - self.nullable, - )) - } - - pub fn finish(&mut self) -> Result<()> { - self.finished = true; - Ok(()) + SamplesTracer::finish(self) } } @@ -348,7 +215,7 @@ pub enum StructMode { pub struct StructTracer { pub mode: StructMode, - pub field_tracers: Vec, + pub field_tracers: Vec, pub nullable: bool, pub field_names: Vec, pub index: HashMap, @@ -356,7 +223,6 @@ pub struct StructTracer { pub item_index: usize, pub seen_this_item: BTreeSet, pub seen_previous_items: BTreeSet, - pub finished: bool, pub path: String, pub options: TracingOptions, } @@ -366,6 +232,7 @@ pub enum StructTracerState { Start, Key, Value(usize, usize), + Finished, } impl StructTracer { @@ -382,12 +249,15 @@ impl StructTracer { item_index: 0, seen_this_item: BTreeSet::new(), seen_previous_items: BTreeSet::new(), - finished: false, } } + pub fn mark_seen(&mut self, field: usize) { + self.seen_this_item.insert(field); + } + pub fn to_field(&self, name: &str) -> Result { - if !self.finished { + if !matches!(self.next, StructTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); @@ -402,8 +272,30 @@ impl StructTracer { Ok(field) } - pub fn mark_seen(&mut self, field: usize) { - self.seen_this_item.insert(field); + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.next, StructTracerState::Finished) { + fail!("Cannot reset unfinished tracer"); + } + for tracer in &mut self.field_tracers { + tracer.reset()?; + } + + self.next = StructTracerState::Start; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> 
{ + if !matches!(self.next, StructTracerState::Start) { + fail!("Incomplete struct in schema tracing"); + } + + for tracer in &mut self.field_tracers { + tracer.finish()?; + } + + self.next = StructTracerState::Finished; + + Ok(()) } } @@ -428,7 +320,7 @@ impl EventSink for StructTracer { Value(field, 0) } else { let field = self.field_tracers.len(); - self.field_tracers.push(Tracer::new( + self.field_tracers.push(SamplesTracer::new( format!("{path}.{key}", path = self.path), self.options.clone(), )); @@ -484,34 +376,32 @@ impl EventSink for StructTracer { _ => Value(field, depth), } } + (Finished, _) => fail!("finished StructTracer cannot handle events"), }; Ok(()) } fn finish(&mut self) -> Result<()> { - if !matches!(self.next, StructTracerState::Start) { - fail!("Incomplete struct in schema tracing"); - } - - for tracer in &mut self.field_tracers { - tracer.finish()?; - } - - self.finished = true; - - Ok(()) + StructTracer::finish(self) } } pub struct TupleTracer { - pub field_tracers: Vec, + pub field_tracers: Vec, pub nullable: bool, pub next: TupleTracerState, - pub finished: bool, pub path: String, pub options: TracingOptions, } +#[derive(Debug, Clone, Copy)] +pub enum TupleTracerState { + WaitForStart, + WaitForItem(usize), + Item(usize, usize), + Finished, +} + impl TupleTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { @@ -520,12 +410,11 @@ impl TupleTracer { field_tracers: Vec::new(), nullable, next: TupleTracerState::WaitForStart, - finished: false, } } pub fn to_field(&self, name: &str) -> Result { - if !self.finished { + if !matches!(self.next, TupleTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -538,22 +427,37 @@ impl TupleTracer { Ok(field) } - fn field_tracer(&mut self, idx: usize) -> &mut Tracer { + fn field_tracer(&mut self, idx: usize) -> &mut SamplesTracer { while self.field_tracers.len() <= idx { - self.field_tracers.push(Tracer::new( + 
self.field_tracers.push(SamplesTracer::new( format!("{path}.{idx}", path = self.path), self.options.clone(), )); } &mut self.field_tracers[idx] } -} -#[derive(Debug, Clone, Copy)] -pub enum TupleTracerState { - WaitForStart, - WaitForItem(usize), - Item(usize, usize), + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.next, TupleTracerState::Finished) { + fail!("Cannot reset unfinished tuple tracer"); + } + for tracer in &mut self.field_tracers { + tracer.reset()?; + } + self.next = TupleTracerState::WaitForStart; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + if !matches!(self.next, TupleTracerState::WaitForStart) { + fail!("Incomplete tuple in schema tracing"); + } + for tracer in &mut self.field_tracers { + tracer.finish()?; + } + self.next = TupleTracerState::Finished; + Ok(()) + } } impl EventSink for TupleTracer { @@ -607,27 +511,20 @@ impl EventSink for TupleTracer { _ => Item(field, depth), } } + (Finished, ev) => fail!("finished tuple tracer cannot handle event {ev}"), }; Ok(()) } fn finish(&mut self) -> Result<()> { - if !matches!(self.next, TupleTracerState::WaitForStart) { - fail!("Incomplete tuple in schema tracing"); - } - for tracer in &mut self.field_tracers { - tracer.finish()?; - } - self.finished = true; - Ok(()) + TupleTracer::finish(self) } } pub struct ListTracer { - pub item_tracer: Box, + pub item_tracer: Box, pub nullable: bool, - pub next: ListTracerState, - pub finished: bool, + pub state: ListTracerState, pub path: String, } @@ -635,22 +532,22 @@ pub struct ListTracer { pub enum ListTracerState { WaitForStart, WaitForItem, - Item(usize), + InItem(usize), + Finished, } impl ListTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { path: path.clone(), - item_tracer: Box::new(Tracer::new(path, options)), + item_tracer: Box::new(SamplesTracer::new(path, options)), nullable, - next: ListTracerState::WaitForStart, - finished: false, + state: ListTracerState::WaitForStart, } } - fn 
to_field(&self, name: &str) -> Result { - if !self.finished { + pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, ListTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -659,6 +556,24 @@ impl ListTracer { Ok(field) } + + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.state, ListTracerState::Finished) { + fail!("Cannot reset unfinished list tracer"); + } + self.item_tracer.reset()?; + self.state = ListTracerState::Finished; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + if !matches!(self.state, ListTracerState::WaitForStart) { + fail!("Incomplete list in schema tracing"); + } + self.item_tracer.finish()?; + self.state = ListTracerState::Finished; + Ok(()) + } } impl EventSink for ListTracer { @@ -667,19 +582,19 @@ impl EventSink for ListTracer { fn accept(&mut self, event: Event<'_>) -> Result<()> { use {Event as E, ListTracerState as S}; - self.next = match (self.next, event) { + self.state = match (self.state, event) { (S::WaitForStart, E::Null | E::Some) => { self.nullable = true; S::WaitForStart } (S::WaitForStart, E::StartSequence) => S::WaitForItem, (S::WaitForItem, E::EndSequence) => S::WaitForStart, - (S::WaitForItem, E::Item) => S::Item(0), - (S::Item(depth), ev) if ev.is_start() => { + (S::WaitForItem, E::Item) => S::InItem(0), + (S::InItem(depth), ev) if ev.is_start() => { self.item_tracer.accept(ev)?; - S::Item(depth + 1) + S::InItem(depth + 1) } - (S::Item(depth), ev) if ev.is_end() => match depth { + (S::InItem(depth), ev) if ev.is_end() => match depth { 0 => fail!( "Invalid event {ev} for list tracer ({path}) in state Item(0)", path = self.path @@ -690,16 +605,16 @@ impl EventSink for ListTracer { } depth => { self.item_tracer.accept(ev)?; - S::Item(depth - 1) + S::InItem(depth - 1) } }, - (S::Item(0), ev) if ev.is_value() => { + (S::InItem(0), ev) if ev.is_value() => { self.item_tracer.accept(ev)?; S::WaitForItem } - (S::Item(depth), ev) => { + (S::InItem(depth), 
ev) => { self.item_tracer.accept(ev)?; - S::Item(depth) + S::InItem(depth) } (state, ev) => fail!( "Invalid event {ev} for list tracer ({path}) in state {state:?}", @@ -710,25 +625,26 @@ impl EventSink for ListTracer { } fn finish(&mut self) -> Result<()> { - if !matches!(self.next, ListTracerState::WaitForStart) { - fail!("Incomplete list in schema tracing"); - } - self.item_tracer.finish()?; - self.finished = true; - Ok(()) + ListTracer::finish(self) } } pub struct UnionTracer { pub variants: Vec>, - pub tracers: BTreeMap, + pub tracers: BTreeMap, pub nullable: bool, pub next: UnionTracerState, - pub finished: bool, pub path: String, pub options: TracingOptions, } +#[derive(Debug, Clone, Copy)] +pub enum UnionTracerState { + Inactive, + Active(usize, usize), + Finished, +} + impl UnionTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { @@ -738,12 +654,11 @@ impl UnionTracer { tracers: BTreeMap::new(), nullable, next: UnionTracerState::Inactive, - finished: false, } } pub fn to_field(&self, name: &str) -> Result { - if !self.finished { + if !matches!(self.next, UnionTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -779,7 +694,7 @@ impl UnionTracer { } self.tracers.entry(idx).or_insert_with(|| { - Tracer::new( + SamplesTracer::new( format!("{path}.{key}", path = self.path, key = variant.as_ref()), self.options.clone(), ) @@ -795,6 +710,26 @@ impl UnionTracer { } Ok(()) } + + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.next, UnionTracerState::Finished) { + fail!("Cannot reset unfinished union tracer"); + } + for tracer in self.tracers.values_mut() { + tracer.reset()?; + } + self.next = UnionTracerState::Inactive; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + // TODO: fix me + for tracer in self.tracers.values_mut() { + tracer.finish()?; + } + self.next = UnionTracerState::Finished; + Ok(()) + } } impl EventSink for UnionTracer { @@ -846,48 +781,45 @@ impl 
EventSink for UnionTracer { } _ => unreachable!(), }, + S::Finished => fail!("finished union tracer cannot handle event"), }; Ok(()) } fn finish(&mut self) -> Result<()> { - for tracer in self.tracers.values_mut() { - tracer.finish()?; - } - self.finished = true; - Ok(()) + UnionTracer::finish(self) } } -#[derive(Debug, Clone, Copy)] -pub enum UnionTracerState { - Inactive, - Active(usize, usize), -} - pub struct MapTracer { pub path: String, - pub key: Box, - pub value: Box, + pub key: Box, + pub value: Box, pub nullable: bool, - pub finished: bool, next: MapTracerState, } +#[derive(Debug, Clone, Copy)] +pub enum MapTracerState { + Start, + Key(usize), + Value(usize), + Finished, +} + impl MapTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { nullable, - key: Box::new(Tracer::new(format!("{path}.$key"), options.clone())), - value: Box::new(Tracer::new(format!("{path}.$value"), options)), + key: Box::new(SamplesTracer::new(format!("{path}.$key"), options.clone())), + value: Box::new(SamplesTracer::new(format!("{path}.$value"), options)), next: MapTracerState::Start, path, - finished: true, } } pub fn to_field(&self, name: &str) -> Result { - if !self.finished { + if !matches!(self.next, MapTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -900,6 +832,20 @@ impl MapTracer { Ok(field) } + + pub fn reset(&mut self) -> Result<()> { + self.key.reset()?; + self.value.reset()?; + self.next = MapTracerState::Start; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + self.key.finish()?; + self.value.finish()?; + self.next = MapTracerState::Finished; + Ok(()) + } } impl EventSink for MapTracer { @@ -984,72 +930,13 @@ impl EventSink for MapTracer { } _ => unreachable!(), }, + S::Finished => fail!("Finished map tracer cannot handle event"), }; Ok(()) } fn finish(&mut self) -> Result<()> { - self.key.finish()?; - self.value.finish()?; - self.finished = true; - Ok(()) - } -} - 
-#[derive(Debug, Clone, Copy)] -pub enum MapTracerState { - Start, - Key(usize), - Value(usize), -} - -pub struct PrimitiveTracer { - pub options: TracingOptions, - pub item_type: GenericDataType, - pub strategy: Option, - pub nullable: bool, - pub finished: bool, -} - -impl PrimitiveTracer { - pub fn new(nullable: bool, options: TracingOptions) -> Self { - Self { - item_type: GenericDataType::Null, - strategy: None, - finished: false, - nullable, - options, - } - } - - pub fn to_field(&self, name: &str) -> Result { - type D = GenericDataType; - - if !self.finished { - fail!("Cannot build field {name} from unfinished tracer"); - } - - if !self.options.allow_null_fields && matches!(self.item_type, D::Null) { - fail!(concat!( - "Encountered null only field. This error can be disabled by ", - "setting `allow_null_fields` to `true` in `TracingOptions`", - )); - } - - match &self.item_type { - dt @ (D::LargeUtf8 | D::Utf8) => { - if !self.options.string_dictionary_encoding { - Ok(GenericField::new(name, dt.clone(), self.nullable)) - } else { - let field = GenericField::new(name, D::Dictionary, self.nullable) - .with_child(GenericField::new("key", D::U32, false)) - .with_child(GenericField::new("value", dt.clone(), false)); - Ok(field) - } - } - dt => Ok(GenericField::new(name, dt.clone(), self.nullable) - .with_optional_strategy(self.strategy.clone())), - } + MapTracer::finish(self) } } @@ -1130,8 +1017,7 @@ impl EventSink for PrimitiveTracer { } fn finish(&mut self) -> Result<()> { - self.finished = true; - Ok(()) + PrimitiveTracer::finish(self) } } diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs new file mode 100644 index 00000000..c186394b --- /dev/null +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -0,0 +1,590 @@ +use crate::internal::{ + error::{fail, Result}, + schema::{GenericDataType, GenericField, Strategy}, + tracing::TracingOptions, +}; + +macro_rules! 
defined_tracer { + ($($variant:ident($impl:ident)),* $(,)? ) => { + #[derive(Debug, PartialEq, Clone)] + pub enum Tracer { + $($variant($impl),)* + } + + macro_rules! dispatch_tracer { + ($obj:expr, $item:ident => $block:expr) => { + match $obj { + $(Tracer::$variant($item) => $block,)* + } + }; + } + }; +} + +defined_tracer!( + Unknown(UnknownTracer), + Primitive(PrimitiveTracer), + List(ListTracer), + Map(MapTracer), + Struct(StructTracer), + Union(UnionTracer), +); + +impl Tracer { + pub fn new(path: String, options: TracingOptions) -> Self { + Self::Unknown(UnknownTracer::new(path, options)) + } +} + +impl Tracer { + pub fn get_path(&self) -> &str { + dispatch_tracer!(self, tracer => tracer.get_path()) + } + + pub fn is_unknown(&self) -> bool { + matches!(self, Tracer::Unknown(_)) + } + + pub fn is_complete(&self) -> bool { + dispatch_tracer!(self, tracer => tracer.is_complete()) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + dispatch_tracer!(self, tracer => tracer.get_type()) + } + + pub fn get_nullable(&self) -> bool { + dispatch_tracer!(self, tracer => tracer.nullable) + } + + pub fn to_field(&self, name: &str) -> Result { + dispatch_tracer!(self, tracer => tracer.to_field(name)) + } + + pub fn get_depth(&self) -> usize { + self.get_path().chars().filter(|c| *c == '.').count() + } + + pub fn get_options(&self) -> &TracingOptions { + dispatch_tracer!(self, tracer => &tracer.options) + } + + pub fn finish(&mut self) -> Result<()> { + dispatch_tracer!(self, tracer => tracer.finish()) + } + + pub fn reset(&mut self) -> Result<()> { + dispatch_tracer!(self, tracer => tracer.reset()) + } +} + +impl Tracer { + pub fn mark_nullable(&mut self) { + dispatch_tracer!(self, tracer => { tracer.nullable = true; }); + } + + pub fn ensure_struct(&mut self, fields: &[S]) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = StructTracer { + path: this.get_path().to_owned(), + options: this.get_options().clone(), + field_tracers: fields + 
.iter() + .map(|field| { + Tracer::new( + format!("{}.{}", this.get_path(), field), + this.get_options().clone(), + ) + }) + .collect(), + field_names: fields.iter().map(|field| field.to_string()).collect(), + nullable: this.get_nullable(), + strategy: None, + }; + *this = Self::Struct(tracer); + Ok(()) + } + Self::Struct(_tracer) => { + // TODO: check fields are equal + Ok(()) + } + _ => fail!( + "mismatched types, previous {:?}, current struct", + self.get_type() + ), + } + } + + pub fn ensure_union(&mut self, variants: &[&str]) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = UnionTracer { + path: this.get_path().to_owned(), + options: this.get_options().clone(), + variant_tracers: variants + .iter() + .map(|variant| { + Tracer::new( + format!("{}.{}", this.get_path(), variant), + this.get_options().clone(), + ) + }) + .collect(), + variant_names: variants.iter().map(|s| s.to_string()).collect(), + nullable: this.get_nullable(), + }; + *this = Self::Union(tracer); + Ok(()) + } + Self::Union(_tracer) => { + // TODO: check fields are equal + Ok(()) + } + _ => fail!( + "mismatched types, previous {:?}, current union", + self.get_type() + ), + } + } + + pub fn ensure_list(&mut self) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = ListTracer { + path: this.get_path().to_owned(), + options: this.get_options().clone(), + nullable: this.get_nullable(), + item_tracer: Box::new(Tracer::new( + format!("{}.item", this.get_path()), + this.get_options().clone(), + )), + }; + *this = Self::List(tracer); + Ok(()) + } + Self::List(_tracer) => Ok(()), + _ => fail!( + "mismatched types, previous {:?}, current list", + self.get_type() + ), + } + } + + pub fn ensure_map(&mut self) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = MapTracer { + path: this.get_path().to_owned(), + options: this.get_options().clone(), + nullable: this.get_nullable(), + key_tracer: Box::new(Tracer::new( + 
format!("{}.key", this.get_path()), + this.get_options().clone(), + )), + value_tracer: Box::new(Tracer::new( + format!("{}.value", this.get_path()), + this.get_options().clone(), + )), + }; + *this = Self::Map(tracer); + Ok(()) + } + Self::Map(_tracer) => Ok(()), + _ => fail!( + "mismatched types, previous {:?}, current list", + self.get_type() + ), + } + } +} + +macro_rules! impl_primitive_ensures { + ( + $( + ($func:ident, $variant:ident) + ),* + $(,)? + ) => { + impl Tracer { + $( + pub fn $func(&mut self) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = PrimitiveTracer::new( + this.get_path().to_owned(), + this.get_options().clone(), + GenericDataType::$variant, + this.get_nullable(), + ); + *this = Self::Primitive(tracer); + Ok(()) + } + Self::Primitive(tracer) if tracer.item_type == GenericDataType::$variant => { + Ok(()) + } + _ => fail!("mismatched types, previous {:?}, current {:?}", self.get_type(), GenericDataType::$variant), + } + } + )* + } + }; +} + +impl_primitive_ensures!( + (ensure_null, Null), + (ensure_bool, Bool), + (ensure_i8, I8), + (ensure_i16, I16), + (ensure_i32, I32), + (ensure_i64, I64), + (ensure_u8, U8), + (ensure_u16, U16), + (ensure_u32, U32), + (ensure_u64, U64), + (ensure_f32, F32), + (ensure_f64, F64), + (ensure_utf8, LargeUtf8), +); + +#[derive(Debug, PartialEq, Clone)] +pub struct UnknownTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub state: UnknownTracerState, +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum UnknownTracerState { + Unfinished, + Finished, +} + +impl UnknownTracer { + pub fn new(path: String, options: TracingOptions) -> Self { + Self { + path, + options, + nullable: false, + state: UnknownTracerState::Unfinished, + } + } + + pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, UnknownTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } + if !self.options.allow_null_fields { + 
fail!(concat!( + "Encountered null only or unknown field. This error can be ", + "disabled by setting `allow_null_fields` to `true` in ", + "`TracingOptions`", + )); + } + + Ok(GenericField::new( + name, + GenericDataType::Null, + self.nullable, + )) + } + + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.state, UnknownTracerState::Finished) { + fail!("cannot reset an unfinished tracer"); + } + self.state = UnknownTracerState::Unfinished; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + if !matches!(self.state, UnknownTracerState::Unfinished) { + fail!("Cannot finish an already finished tracer"); + } + self.state = UnknownTracerState::Finished; + Ok(()) + } + + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + false + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + None + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct MapTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub key_tracer: Box, + pub value_tracer: Box, +} + +impl MapTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + self.key_tracer.is_complete() && self.value_tracer.is_complete() + } + + pub fn to_field(&self, name: &str) -> Result { + let key = self.key_tracer.to_field("key")?; + let value = self.value_tracer.to_field("value")?; + let res = GenericField::new(name, GenericDataType::Map, self.nullable) + .with_child(key) + .with_child(value); + Ok(res) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::Map) + } + + pub fn reset(&mut self) -> Result<()> { + self.key_tracer.reset()?; + self.value_tracer.reset()?; + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + self.key_tracer.finish()?; + self.value_tracer.finish()?; + Ok(()) + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct ListTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub item_tracer: Box, 
+} + +impl ListTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + self.item_tracer.is_complete() + } + + pub fn to_field(&self, name: &str) -> Result { + let item = self.item_tracer.to_field("item")?; + let res = + GenericField::new(name, GenericDataType::LargeList, self.nullable).with_child(item); + Ok(res) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::LargeList) + } + + pub fn reset(&mut self) -> Result<()> { + self.item_tracer.reset() + } + + pub fn finish(&mut self) -> Result<()> { + self.item_tracer.finish() + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct StructTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub field_names: Vec, + pub field_tracers: Vec, + pub strategy: Option, +} + +impl StructTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + self.field_tracers.iter().all(Tracer::is_complete) + } + + pub fn to_field(&self, name: &str) -> Result { + let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); + + for (tracer, name) in self.field_tracers.iter().zip(&self.field_names) { + field.children.push(tracer.to_field(name)?); + } + field.strategy = self.strategy.clone(); + + Ok(field) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::Struct) + } + + pub fn reset(&mut self) -> Result<()> { + for tracer in &mut self.field_tracers { + tracer.reset()?; + } + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + for tracer in &mut self.field_tracers { + tracer.finish()?; + } + Ok(()) + } +} + +#[derive(Debug, PartialEq, Clone)] + +pub struct UnionTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub variant_names: Vec, + pub variant_tracers: Vec, +} + +impl UnionTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + 
self.variant_tracers.iter().all(Tracer::is_complete) + } + + pub fn to_field(&self, name: &str) -> Result { + let mut field = GenericField::new(name, GenericDataType::Union, self.nullable); + for (tracer, name) in self.variant_tracers.iter().zip(&self.variant_names) { + field.children.push(tracer.to_field(name)?); + } + Ok(field) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::Union) + } + + pub fn reset(&mut self) -> Result<()> { + for tracer in &mut self.variant_tracers { + tracer.reset()?; + } + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + for tracer in &mut self.variant_tracers { + tracer.finish()?; + } + Ok(()) + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct PrimitiveTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub strategy: Option, + pub item_type: GenericDataType, + pub state: PrimitiveTracerState, +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum PrimitiveTracerState { + Unfinished, + Finished, +} + +impl PrimitiveTracer { + pub fn new( + path: String, + options: TracingOptions, + item_type: GenericDataType, + nullable: bool, + ) -> Self { + Self { + path, + options, + item_type, + nullable, + strategy: None, + state: PrimitiveTracerState::Unfinished, + } + } + + pub fn finish(&mut self) -> Result<()> { + if matches!(self.state, PrimitiveTracerState::Finished) { + fail!("Cannot finish an already finished tracer"); + } + self.state = PrimitiveTracerState::Finished; + Ok(()) + } + + pub fn reset(&mut self) -> Result<()> { + if !matches!(self.state, PrimitiveTracerState::Finished) { + fail!("Cannot reset an unfished tracer"); + } + self.state = PrimitiveTracerState::Unfinished; + Ok(()) + } + + pub fn to_field(&self, name: &str) -> Result { + type D = GenericDataType; + + if !matches!(self.state, PrimitiveTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } + + if !self.options.allow_null_fields && 
matches!(self.item_type, D::Null) { + fail!(concat!( + "Encountered null only field. This error can be disabled by ", + "setting `allow_null_fields` to `true` in `TracingOptions`", + )); + } + + match &self.item_type { + dt @ (D::LargeUtf8 | D::Utf8) => { + if !self.options.string_dictionary_encoding { + Ok(GenericField::new(name, dt.clone(), self.nullable)) + } else { + let field = GenericField::new(name, D::Dictionary, self.nullable) + .with_child(GenericField::new("key", D::U32, false)) + .with_child(GenericField::new("value", dt.clone(), false)); + Ok(field) + } + } + dt => Ok(GenericField::new(name, dt.clone(), self.nullable) + .with_optional_strategy(self.strategy.clone())), + } + } +} + +impl PrimitiveTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + true + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&self.item_type) + } +} diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs new file mode 100644 index 00000000..d23a8204 --- /dev/null +++ b/serde_arrow/src/internal/tracing/types.rs @@ -0,0 +1,619 @@ +use serde::{ + de::{DeserializeSeed, Visitor}, + Deserialize, Deserializer, +}; + +use crate::internal::{ + fail, + schema::{GenericField, Strategy}, + tracing::{tracer::Tracer, TracingOptions}, + Error, Result, +}; + +pub fn trace_type<'de, T: Deserialize<'de>>( + options: TracingOptions, + name: &str, +) -> Result { + let mut tracer = Tracer::new(String::from("$"), options); + + // TODO: make configurable + let mut attempts = 100; + while !tracer.is_complete() { + if attempts == 0 { + fail!("could not determine ...") + } + T::deserialize(TraceAny(&mut tracer))?; + attempts -= 1; + } + + tracer.finish()?; + tracer.to_field(name) +} + +struct TraceAny<'a>(&'a mut Tracer); + +impl<'de, 'a> serde::de::Deserializer<'de> for TraceAny<'a> { + type Error = Error; + + fn deserialize_any>(self, _visitor: V) -> Result { + fail!("deserialize_any is not 
supported") + } + + fn deserialize_bool>(self, visitor: V) -> Result { + self.0.ensure_bool()?; + visitor.visit_bool(Default::default()) + } + + fn deserialize_i8>(self, visitor: V) -> Result { + self.0.ensure_i8()?; + visitor.visit_i8(Default::default()) + } + + fn deserialize_i16>(self, visitor: V) -> Result { + self.0.ensure_i16()?; + visitor.visit_i16(Default::default()) + } + + fn deserialize_i32>(self, visitor: V) -> Result { + self.0.ensure_i32()?; + visitor.visit_i32(Default::default()) + } + + fn deserialize_i64>(self, visitor: V) -> Result { + self.0.ensure_i64()?; + visitor.visit_i64(Default::default()) + } + + fn deserialize_u8>(self, visitor: V) -> Result { + self.0.ensure_u8()?; + visitor.visit_u8(Default::default()) + } + + fn deserialize_u16>(self, visitor: V) -> Result { + self.0.ensure_u16()?; + visitor.visit_u16(Default::default()) + } + + fn deserialize_u32>(self, visitor: V) -> Result { + self.0.ensure_u32()?; + visitor.visit_u32(Default::default()) + } + + fn deserialize_u64>(self, visitor: V) -> Result { + self.0.ensure_u64()?; + visitor.visit_u64(Default::default()) + } + + fn deserialize_f32>(self, visitor: V) -> Result { + self.0.ensure_f32()?; + visitor.visit_f32(Default::default()) + } + + fn deserialize_f64>(self, visitor: V) -> Result { + self.0.ensure_f64()?; + visitor.visit_f64(Default::default()) + } + + fn deserialize_char>(self, visitor: V) -> Result { + self.0.ensure_u32()?; + visitor.visit_char(Default::default()) + } + + fn deserialize_str>(self, visitor: V) -> Result { + if self.0.get_options().try_parse_dates { + fail!("Cannot try to parse dates without examples, prefer serialize_into_field(s)"); + } + + self.0.ensure_utf8()?; + visitor.visit_str("") + } + + fn deserialize_string>(self, visitor: V) -> Result { + if self.0.get_options().try_parse_dates { + fail!("Cannot try to parse dates without examples, prefer serialize_into_field(s)"); + } + + self.0.ensure_utf8()?; + visitor.visit_string(Default::default()) + } + + fn 
deserialize_bytes>(self, _visitor: V) -> Result { + todo!() + } + + fn deserialize_byte_buf>(self, _visitor: V) -> Result { + todo!() + } + + fn deserialize_option>(self, visitor: V) -> Result { + self.0.mark_nullable(); + visitor.visit_some(self) + } + + fn deserialize_unit>(self, visitor: V) -> Result { + self.0.ensure_null()?; + visitor.visit_unit() + } + + fn deserialize_unit_struct>( + self, + _name: &'static str, + visitor: V, + ) -> Result { + self.0.ensure_null()?; + visitor.visit_unit() + } + + fn deserialize_newtype_struct>( + self, + _name: &'static str, + visitor: V, + ) -> Result { + visitor.visit_newtype_struct(self) + } + + fn deserialize_seq>(self, visitor: V) -> Result { + self.0.ensure_list()?; + let Tracer::List(tracer) = self.0 else { + unreachable!() + }; + visitor.visit_seq(TraceSeq(&mut tracer.item_tracer, true)) + } + + fn deserialize_tuple>(self, len: usize, visitor: V) -> Result { + let field_names = (0..len).map(|idx| idx.to_string()).collect::>(); + self.0.ensure_struct(&field_names)?; + + let Tracer::Struct(tracer) = self.0 else { + unreachable!(); + }; + tracer.strategy = Some(Strategy::TupleAsStruct); + + visitor.visit_seq(TraceTupleStruct { + field_tracers: &mut tracer.field_tracers, + pos: 0, + }) + } + + fn deserialize_tuple_struct>( + self, + _name: &'static str, + len: usize, + visitor: V, + ) -> Result { + self.deserialize_tuple(len, visitor) + } + + fn deserialize_map>(self, visitor: V) -> Result { + if self.0.get_options().map_as_struct { + fail!("Cannot trace maps as structs without examples, prefer serialize_into_field(s)"); + } + + self.0.ensure_map()?; + let Tracer::Map(tracer) = self.0 else { + unreachable!() + }; + visitor.visit_map(TraceMap { + key_tracer: &mut tracer.key_tracer, + value_tracer: &mut tracer.value_tracer, + active: true, + }) + } + + fn deserialize_struct>( + self, + _name: &'static str, + fields: &'static [&'static str], + visitor: V, + ) -> Result { + self.0.ensure_struct(fields)?; + let 
Tracer::Struct(tracer) = self.0 else { + unreachable!() + }; + + visitor.visit_map(TraceStruct { + field_tracers: &mut tracer.field_tracers, + pos: 0, + fields, + }) + } + + fn deserialize_enum>( + self, + _name: &'static str, + variants: &'static [&'static str], + visitor: V, + ) -> Result { + self.0.ensure_union(variants)?; + + let Tracer::Union(tracer) = self.0 else { + fail!("invalid state") + }; + + let idx = tracer + .variant_tracers + .iter() + .position(|tracer| tracer.is_unknown()) + .unwrap_or_default(); + if idx >= tracer.variant_tracers.len() { + fail!("invalid variant index"); + } + + let res = visitor.visit_enum(TraceEnum { + tracer: &mut tracer.variant_tracers[idx], + pos: idx, + variant: &tracer.variant_names[idx], + })?; + Ok(res) + } + + fn deserialize_identifier>(self, visitor: V) -> Result { + self.deserialize_str(visitor) + } + + fn deserialize_ignored_any>(self, visitor: V) -> Result { + // TODO: is this correct? + visitor.visit_unit() + } +} + +struct TraceMap<'a> { + key_tracer: &'a mut Tracer, + value_tracer: &'a mut Tracer, + active: bool, +} + +impl<'de, 'a> serde::de::MapAccess<'de> for TraceMap<'a> { + type Error = Error; + + fn next_key_seed>(&mut self, seed: K) -> Result> { + if self.active { + let key = seed.deserialize(TraceAny(self.key_tracer))?; + Ok(Some(key)) + } else { + Ok(None) + } + } + + fn next_value_seed>(&mut self, seed: V) -> Result { + self.active = false; + seed.deserialize(TraceAny(self.value_tracer)) + } +} + +struct TraceTupleStruct<'a> { + field_tracers: &'a mut [Tracer], + pos: usize, +} + +impl<'de, 'a> serde::de::SeqAccess<'de> for TraceTupleStruct<'a> { + type Error = Error; + + fn next_element_seed>(&mut self, seed: T) -> Result> { + if self.pos >= self.field_tracers.len() { + return Ok(None); + } + + let item = seed.deserialize(TraceAny(&mut self.field_tracers[self.pos]))?; + self.pos += 1; + + Ok(Some(item)) + } +} + +struct TraceStruct<'a> { + field_tracers: &'a mut [Tracer], + pos: usize, + fields: 
&'static [&'static str], +} + +impl<'de, 'a> serde::de::MapAccess<'de> for TraceStruct<'a> { + type Error = Error; + + fn next_key_seed>(&mut self, seed: K) -> Result> { + if self.pos >= self.fields.len() { + return Ok(None); + } + let key = seed.deserialize(IdentifierDeserializer { + idx: self.pos, + name: self.fields[self.pos], + })?; + Ok(Some(key)) + } + + fn next_value_seed>(&mut self, seed: V) -> Result { + let value = seed.deserialize(TraceAny(&mut self.field_tracers[self.pos]))?; + self.pos += 1; + + Ok(value) + } +} + +struct TraceEnum<'a> { + tracer: &'a mut Tracer, + pos: usize, + variant: &'a str, +} + +impl<'de, 'a> serde::de::EnumAccess<'de> for TraceEnum<'a> { + type Error = Error; + type Variant = TraceAny<'a>; + + fn variant_seed>(self, seed: V) -> Result<(V::Value, Self::Variant)> { + let variant = seed.deserialize(IdentifierDeserializer { + idx: self.pos, + name: self.variant, + })?; + Ok((variant, TraceAny(self.tracer))) + } +} + +impl<'de, 'a> serde::de::VariantAccess<'de> for TraceAny<'a> { + type Error = Error; + + fn unit_variant(self) -> Result<()> { + <()>::deserialize(self) + } + + fn newtype_variant_seed>(self, seed: T) -> Result { + seed.deserialize(self) + } + + fn tuple_variant>(self, len: usize, visitor: V) -> Result { + self.deserialize_tuple(len, visitor) + } + + fn struct_variant>( + self, + fields: &'static [&'static str], + visitor: V, + ) -> Result { + self.deserialize_struct("", fields, visitor) + } +} + +struct TraceSeq<'a>(&'a mut Tracer, bool); + +impl<'de, 'a> serde::de::SeqAccess<'de> for TraceSeq<'a> { + type Error = Error; + + fn next_element_seed>(&mut self, seed: T) -> Result> { + if self.1 { + self.1 = false; + let item = seed.deserialize(TraceAny(self.0))?; + Ok(Some(item)) + } else { + Ok(None) + } + } +} + +struct IdentifierDeserializer<'a> { + idx: usize, + name: &'a str, +} + +macro_rules! 
unimplemented { + ($lifetime:lifetime, $name:ident $($tt:tt)*) => { + fn $name>(self $($tt)*, _: V) -> Result { + fail!("{} is not implemented", stringify!($name)) + } + }; +} + +impl<'de, 'a> serde::de::Deserializer<'de> for IdentifierDeserializer<'a> { + type Error = Error; + + fn deserialize_identifier>(self, visitor: V) -> Result { + self.deserialize_str(visitor) + } + + fn deserialize_any>(self, visitor: V) -> Result { + self.deserialize_str(visitor) + } + + fn deserialize_str>(self, visitor: V) -> Result { + visitor.visit_str(self.name) + } + + fn deserialize_string>(self, visitor: V) -> Result { + visitor.visit_string(self.name.to_owned()) + } + + fn deserialize_u64>(self, visitor: V) -> Result { + visitor.visit_u64(u64::try_from(self.idx)?) + } + + unimplemented!('de, deserialize_bool); + unimplemented!('de, deserialize_i8); + unimplemented!('de, deserialize_i16); + unimplemented!('de, deserialize_i32); + unimplemented!('de, deserialize_i64); + unimplemented!('de, deserialize_u8); + unimplemented!('de, deserialize_u16); + unimplemented!('de, deserialize_u32); + unimplemented!('de, deserialize_f32); + unimplemented!('de, deserialize_f64); + unimplemented!('de, deserialize_char); + unimplemented!('de, deserialize_bytes); + unimplemented!('de, deserialize_byte_buf); + unimplemented!('de, deserialize_option); + unimplemented!('de, deserialize_unit); + unimplemented!('de, deserialize_unit_struct, _: &'static str); + unimplemented!('de, deserialize_newtype_struct, _: &'static str); + unimplemented!('de, deserialize_seq); + unimplemented!('de, deserialize_tuple, _: usize); + unimplemented!('de, deserialize_tuple_struct, _: &'static str, _: usize); + unimplemented!('de, deserialize_map); + unimplemented!('de, deserialize_struct, _: &'static str, _: &'static [&'static str]); + unimplemented!('de, deserialize_enum, _: &'static str, _: &'static [&'static str]); + unimplemented!('de, deserialize_ignored_any); +} + +#[test] +fn trace_primitives() { + use 
{crate::internal::schema::GenericDataType as T, GenericField as F}; + + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::U8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::U16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::U32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::U64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::F32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::F64, false) + ); +} + +#[test] +fn trace_option() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + assert_eq!( + trace_type::(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::>(TracingOptions::default(), "root").unwrap(), + F::new("root", T::I8, true) + ); +} + +#[test] +fn trace_struct() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + #[allow(dead_code)] + #[derive(Deserialize)] + struct Example { + a: bool, + b: Option, + } + + let actual = trace_type::(TracingOptions::default(), "root").unwrap(); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("a", T::Bool, false)) + .with_child(F::new("b", T::I8, true)); + + assert_eq!(actual, expected); +} + +#[test] +fn 
trace_tuple_as_struct() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + let actual = trace_type::<(bool, Option)>(TracingOptions::default(), "root").unwrap(); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("0", T::Bool, false)) + .with_child(F::new("1", T::I8, true)) + .with_strategy(Strategy::TupleAsStruct); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_union() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + #[allow(dead_code)] + #[derive(Deserialize)] + enum Example { + A(i8), + B(f32), + } + + let actual = trace_type::(TracingOptions::default(), "root").unwrap(); + let expected = F::new("root", T::Union, false) + .with_child(F::new("A", T::I8, false)) + .with_child(F::new("B", T::F32, false)); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_list() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + let actual = trace_type::>(TracingOptions::default(), "root").unwrap(); + let expected = + F::new("root", T::LargeList, false).with_child(F::new("item", T::LargeUtf8, false)); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_map() { + use std::collections::HashMap; + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + let actual = + trace_type::>(TracingOptions::default().map_as_struct(false), "root") + .unwrap(); + let expected = F::new("root", T::Map, false) + .with_child(F::new("key", T::I8, false)) + .with_child(F::new("value", T::LargeUtf8, false)); + + assert_eq!(actual, expected); +} + +#[test] +fn issue_90() { + use {crate::internal::schema::GenericDataType as T, GenericField as F}; + + #[derive(Deserialize)] + pub struct Distribution { + pub samples: Vec, + pub statistic: String, + } + + #[derive(Deserialize)] + pub struct VectorMetric { + pub distribution: Option, + } + + let actual = trace_type::(TracingOptions::default(), "root").unwrap(); + let expected = F::new("root", T::Struct, 
false).with_child( + F::new("distribution", T::Struct, true) + .with_child(F::new("samples", T::LargeList, false).with_child(F::new( + "item", + T::F64, + false, + ))) + .with_child(F::new("statistic", T::LargeUtf8, false)), + ); + + assert_eq!(actual, expected); +} diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index 302e04cb..267c889d 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -269,7 +269,7 @@ macro_rules! test_events { serialization::{compile_serialization, CompilationOptions, Interpreter}, event::Event, schema::{GenericDataType, GenericField}, - tracing::{Tracer, TracingOptions}, + tracing::{SamplesTracer, TracingOptions}, sink::{accept_events, StripOuterSequenceSink}, }; @@ -282,7 +282,7 @@ macro_rules! test_events { let options = TracingOptions::default(); $(let options = $tracing_options;)? - let tracer = Tracer::new(String::from("$"), options); + let tracer = SamplesTracer::new(String::from("$"), options); let mut tracer = StripOuterSequenceSink::new(tracer); accept_events(&mut tracer, events.iter().cloned()).unwrap(); let root = tracer.into_inner().to_field("root").unwrap(); From 175faff62a5176c2ddea4a85958276c1b3b77931 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 27 Oct 2023 17:01:50 +0200 Subject: [PATCH 03/27] Start to cleanup the naming for the sample tracer --- serde_arrow/src/internal/tracing/samples.rs | 201 +++++++++++--------- 1 file changed, 106 insertions(+), 95 deletions(-) diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index 0eecd67b..b402b700 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -214,24 +214,29 @@ pub enum StructMode { } pub struct StructTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub state: StructTracerState, pub mode: StructMode, pub field_tracers: Vec, - pub 
nullable: bool, pub field_names: Vec, pub index: HashMap, - pub next: StructTracerState, + // TODO: document and clean up these fields pub item_index: usize, pub seen_this_item: BTreeSet, pub seen_previous_items: BTreeSet, - pub path: String, - pub options: TracingOptions, } #[derive(Debug, Clone, Copy)] pub enum StructTracerState { - Start, - Key, - Value(usize, usize), + /// The tracer is waiting for the next key + WaitForKey, + /// The tracer is currently processing the next key + InKey, + /// The tracer is currently tracing a value for `(field, depth)` + InValue(usize, usize), + /// The tracer is finished Finished, } @@ -245,7 +250,7 @@ impl StructTracer { field_names: Vec::new(), index: HashMap::new(), nullable, - next: StructTracerState::Start, + state: StructTracerState::WaitForKey, item_index: 0, seen_this_item: BTreeSet::new(), seen_previous_items: BTreeSet::new(), @@ -257,7 +262,7 @@ impl StructTracer { } pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.next, StructTracerState::Finished) { + if !matches!(self.state, StructTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); @@ -273,19 +278,19 @@ impl StructTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.next, StructTracerState::Finished) { + if !matches!(self.state, StructTracerState::Finished) { fail!("Cannot reset unfinished tracer"); } for tracer in &mut self.field_tracers { tracer.reset()?; } - self.next = StructTracerState::Start; + self.state = StructTracerState::WaitForKey; Ok(()) } pub fn finish(&mut self) -> Result<()> { - if !matches!(self.next, StructTracerState::Start) { + if !matches!(self.state, StructTracerState::WaitForKey) { fail!("Incomplete struct in schema tracing"); } @@ -293,7 +298,7 @@ impl StructTracer { tracer.finish()?; } - self.next = StructTracerState::Finished; + self.state = StructTracerState::Finished; Ok(()) } @@ -306,18 
+311,18 @@ impl EventSink for StructTracer { use StructTracerState::*; type E<'a> = Event<'a>; - self.next = match (self.next, event) { - (Start, E::StartStruct | E::StartMap) => Key, - (Start, E::Null | E::Some) => { + self.state = match (self.state, event) { + (WaitForKey, E::StartStruct | E::StartMap) => InKey, + (WaitForKey, E::Null | E::Some) => { self.nullable = true; - Start + WaitForKey } - (Start, ev) => fail!("Invalid event {ev} for struct tracer in state Start"), - (Key, E::Item) => Key, - (Key, E::Str(key)) => { + (WaitForKey, ev) => fail!("Invalid event {ev} for struct tracer in state Start"), + (InKey, E::Item) => InKey, + (InKey, E::Str(key)) => { if let Some(&field) = self.index.get(key) { self.mark_seen(field); - Value(field, 0) + InValue(field, 0) } else { let field = self.field_tracers.len(); self.field_tracers.push(SamplesTracer::new( @@ -327,10 +332,10 @@ impl EventSink for StructTracer { self.field_names.push(key.to_owned()); self.index.insert(key.to_owned(), field); self.mark_seen(field); - Value(field, 0) + InValue(field, 0) } } - (Key, E::EndStruct | E::EndMap) => { + (InKey, E::EndStruct | E::EndMap) => { if self.item_index == 0 { self.seen_previous_items = self.seen_this_item.clone(); } @@ -348,32 +353,32 @@ impl EventSink for StructTracer { self.seen_this_item.clear(); self.item_index += 1; - Start + WaitForKey } - (Key, ev) => fail!("Invalid event {ev} for struct tracer in state Key"), - (Value(field, depth), ev) if ev.is_start() => { + (InKey, ev) => fail!("Invalid event {ev} for struct tracer in state Key"), + (InValue(field, depth), ev) if ev.is_start() => { self.field_tracers[field].accept(ev)?; - Value(field, depth + 1) + InValue(field, depth + 1) } - (Value(field, depth), ev) if ev.is_end() => { + (InValue(field, depth), ev) if ev.is_end() => { self.field_tracers[field].accept(ev)?; match depth { 0 => fail!("Invalid closing event in struct tracer in state Value"), - 1 => Key, - depth => Value(field, depth - 1), + 1 => InKey, + 
depth => InValue(field, depth - 1), } } - (Value(field, depth), ev) if ev.is_marker() => { + (InValue(field, depth), ev) if ev.is_marker() => { self.field_tracers[field].accept(ev)?; // markers are always followed by the actual value - Value(field, depth) + InValue(field, depth) } - (Value(field, depth), ev) => { + (InValue(field, depth), ev) => { self.field_tracers[field].accept(ev)?; match depth { // Any event at depth == 0 that does not start a structure (is a complete value) - 0 => Key, - _ => Value(field, depth), + 0 => InKey, + _ => InValue(field, depth), } } (Finished, _) => fail!("finished StructTracer cannot handle events"), @@ -389,7 +394,7 @@ impl EventSink for StructTracer { pub struct TupleTracer { pub field_tracers: Vec, pub nullable: bool, - pub next: TupleTracerState, + pub state: TupleTracerState, pub path: String, pub options: TracingOptions, } @@ -397,8 +402,10 @@ pub struct TupleTracer { #[derive(Debug, Clone, Copy)] pub enum TupleTracerState { WaitForStart, + /// Wait for the item with `(field_index)` WaitForItem(usize), - Item(usize, usize), + /// Process the item at `(field_index, depth)` + InItem(usize, usize), Finished, } @@ -409,12 +416,12 @@ impl TupleTracer { options, field_tracers: Vec::new(), nullable, - next: TupleTracerState::WaitForStart, + state: TupleTracerState::WaitForStart, } } pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.next, TupleTracerState::Finished) { + if !matches!(self.state, TupleTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -438,24 +445,24 @@ impl TupleTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.next, TupleTracerState::Finished) { + if !matches!(self.state, TupleTracerState::Finished) { fail!("Cannot reset unfinished tuple tracer"); } for tracer in &mut self.field_tracers { tracer.reset()?; } - self.next = TupleTracerState::WaitForStart; + self.state = TupleTracerState::WaitForStart; Ok(()) } pub fn finish(&mut self) -> 
Result<()> { - if !matches!(self.next, TupleTracerState::WaitForStart) { + if !matches!(self.state, TupleTracerState::WaitForStart) { fail!("Incomplete tuple in schema tracing"); } for tracer in &mut self.field_tracers { tracer.finish()?; } - self.next = TupleTracerState::Finished; + self.state = TupleTracerState::Finished; Ok(()) } } @@ -467,7 +474,7 @@ impl EventSink for TupleTracer { use TupleTracerState::*; type E<'a> = Event<'a>; - self.next = match (self.next, event) { + self.state = match (self.state, event) { (WaitForStart, Event::StartTuple) => WaitForItem(0), (WaitForStart, E::Null | E::Some) => { self.nullable = true; @@ -477,17 +484,17 @@ impl EventSink for TupleTracer { "Invalid event {ev} for TupleTracer in state Start [{path}]", path = self.path ), - (WaitForItem(field), Event::Item) => Item(field, 0), + (WaitForItem(field), Event::Item) => InItem(field, 0), (WaitForItem(_), E::EndTuple) => WaitForStart, (WaitForItem(field), ev) => fail!( "Invalid event {ev} for TupleTracer in state WaitForItem({field}) [{path}]", path = self.path ), - (Item(field, depth), ev) if ev.is_start() => { + (InItem(field, depth), ev) if ev.is_start() => { self.field_tracer(field).accept(ev)?; - Item(field, depth + 1) + InItem(field, depth + 1) } - (Item(field, depth), ev) if ev.is_end() => { + (InItem(field, depth), ev) if ev.is_end() => { self.field_tracer(field).accept(ev)?; match depth { 0 => fail!( @@ -495,20 +502,20 @@ impl EventSink for TupleTracer { path = self.path ), 1 => WaitForItem(field + 1), - depth => Item(field, depth - 1), + depth => InItem(field, depth - 1), } } - (Item(field, depth), ev) if ev.is_marker() => { + (InItem(field, depth), ev) if ev.is_marker() => { self.field_tracer(field).accept(ev)?; // markers are always followed by the actual value - Item(field, depth) + InItem(field, depth) } - (Item(field, depth), ev) => { + (InItem(field, depth), ev) => { self.field_tracer(field).accept(ev)?; match depth { // Any event at depth == 0 that does not start 
a structure (is a complete value) 0 => WaitForItem(field + 1), - _ => Item(field, depth), + _ => InItem(field, depth), } } (Finished, ev) => fail!("finished tuple tracer cannot handle event {ev}"), @@ -640,8 +647,10 @@ pub struct UnionTracer { #[derive(Debug, Clone, Copy)] pub enum UnionTracerState { - Inactive, - Active(usize, usize), + /// Wait for the next variant + WaitForVariant, + /// Process the current variant at `(variant_index, depth)` + InVariant(usize, usize), Finished, } @@ -653,7 +662,7 @@ impl UnionTracer { variants: Vec::new(), tracers: BTreeMap::new(), nullable, - next: UnionTracerState::Inactive, + next: UnionTracerState::WaitForVariant, } } @@ -718,7 +727,7 @@ impl UnionTracer { for tracer in self.tracers.values_mut() { tracer.reset()?; } - self.next = UnionTracerState::Inactive; + self.next = UnionTracerState::WaitForVariant; Ok(()) } @@ -740,43 +749,43 @@ impl EventSink for UnionTracer { type E<'a> = Event<'a>; self.next = match self.next { - S::Inactive => match event { + S::WaitForVariant => match event { E::Variant(variant, idx) => { self.ensure_variant(variant, idx)?; - S::Active(idx, 0) + S::InVariant(idx, 0) } E::Some => fail!("Nullable unions are not supported"), E::OwnedVariant(variant, idx) => { self.ensure_variant(variant, idx)?; - S::Active(idx, 0) + S::InVariant(idx, 0) } ev => fail!("Invalid event {ev} for UnionTracer in State Inactive"), }, - S::Active(idx, depth) => match event { + S::InVariant(idx, depth) => match event { ev if ev.is_start() => { self.tracers.get_mut(&idx).unwrap().accept(ev)?; - S::Active(idx, depth + 1) + S::InVariant(idx, depth + 1) } ev if ev.is_end() => match depth { 0 => fail!("Invalid end event {ev} at depth 0 in UnionTracer"), 1 => { self.tracers.get_mut(&idx).unwrap().accept(ev)?; - S::Inactive + S::WaitForVariant } _ => { self.tracers.get_mut(&idx).unwrap().accept(ev)?; - S::Active(idx, depth - 1) + S::InVariant(idx, depth - 1) } }, ev if ev.is_marker() => { 
self.tracers.get_mut(&idx).unwrap().accept(ev)?; - S::Active(idx, depth) + S::InVariant(idx, depth) } ev if ev.is_value() => { self.tracers.get_mut(&idx).unwrap().accept(ev)?; match depth { - 0 => S::Inactive, - _ => S::Active(idx, depth), + 0 => S::WaitForVariant, + _ => S::InVariant(idx, depth), } } _ => unreachable!(), @@ -796,14 +805,16 @@ pub struct MapTracer { pub key: Box, pub value: Box, pub nullable: bool, - next: MapTracerState, + pub state: MapTracerState, } #[derive(Debug, Clone, Copy)] pub enum MapTracerState { - Start, - Key(usize), - Value(usize), + WaitForKey, + /// Process the current key at `(depth)` + InKey(usize), + /// Process the current value at `(depth)` + InValue(usize), Finished, } @@ -813,13 +824,13 @@ impl MapTracer { nullable, key: Box::new(SamplesTracer::new(format!("{path}.$key"), options.clone())), value: Box::new(SamplesTracer::new(format!("{path}.$value"), options)), - next: MapTracerState::Start, + state: MapTracerState::WaitForKey, path, } } pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.next, MapTracerState::Finished) { + if !matches!(self.state, MapTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } @@ -836,14 +847,14 @@ impl MapTracer { pub fn reset(&mut self) -> Result<()> { self.key.reset()?; self.value.reset()?; - self.next = MapTracerState::Start; + self.state = MapTracerState::WaitForKey; Ok(()) } pub fn finish(&mut self) -> Result<()> { self.key.finish()?; self.value.finish()?; - self.next = MapTracerState::Finished; + self.state = MapTracerState::Finished; Ok(()) } } @@ -855,77 +866,77 @@ impl EventSink for MapTracer { type S = MapTracerState; type E<'a> = Event<'a>; - self.next = match self.next { - S::Start => match event { - Event::StartMap => S::Key(0), + self.state = match self.state { + S::WaitForKey => match event { + Event::StartMap => S::InKey(0), Event::Null | Event::Some => { self.nullable = true; - S::Start + S::WaitForKey } ev => fail!("Unexpected event 
{ev} in state Start of MapTracer"), }, - S::Key(depth) => match event { - Event::Item if depth == 0 => S::Key(depth), + S::InKey(depth) => match event { + Event::Item if depth == 0 => S::InKey(depth), ev if ev.is_end() => match depth { 0 => { if !matches!(ev, E::EndMap) { fail!("Unexpected event {ev} in State Key at depth 0 in MapTracer") } - S::Start + S::WaitForKey } 1 => { self.key.accept(ev)?; - S::Value(0) + S::InValue(0) } _ => { self.key.accept(ev)?; - S::Key(depth - 1) + S::InKey(depth - 1) } }, ev if ev.is_start() => { self.key.accept(ev)?; - S::Key(depth + 1) + S::InKey(depth + 1) } ev if ev.is_marker() => { self.key.accept(ev)?; - S::Key(depth) + S::InKey(depth) } ev if ev.is_value() => { self.key.accept(ev)?; if depth == 0 { - S::Value(0) + S::InValue(0) } else { - S::Key(depth) + S::InKey(depth) } } _ => unreachable!(), }, - S::Value(depth) => match event { + S::InValue(depth) => match event { ev if ev.is_end() => match depth { 0 => fail!("Unexpected event {ev} in State Value at depth 0 in MapTracer"), 1 => { self.value.accept(ev)?; - S::Key(0) + S::InKey(0) } _ => { self.value.accept(ev)?; - S::Value(depth - 1) + S::InValue(depth - 1) } }, ev if ev.is_start() => { self.value.accept(ev)?; - S::Value(depth + 1) + S::InValue(depth + 1) } ev if ev.is_marker() => { self.value.accept(ev)?; - S::Value(depth) + S::InValue(depth) } ev if ev.is_value() => { self.value.accept(ev)?; if depth == 0 { - S::Key(0) + S::InKey(0) } else { - S::Value(depth) + S::InValue(depth) } } _ => unreachable!(), From c17eb836d4aa66cf9847263e11d0852c6a1d257e Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 09:23:59 +0200 Subject: [PATCH 04/27] Use new resolver for workspace --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index aae049fd..bbed1bef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,3 +3,5 @@ members = [ "serde_arrow", "example", ] + +resolver = "2" From 65fb940f6e1bec39631ffa4d2174ab6be03cf6b6 Mon Sep 17 
00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 09:24:13 +0200 Subject: [PATCH 05/27] Continue unifying the different tracers --- serde_arrow/src/internal/tracing/samples.rs | 249 ++++++++-------- serde_arrow/src/internal/tracing/tracer.rs | 296 +++++++++++++++++--- serde_arrow/src/internal/tracing/types.rs | 54 ++-- 3 files changed, 416 insertions(+), 183 deletions(-) diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index b402b700..b1786635 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -1,5 +1,5 @@ use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, + collections::{BTreeSet, HashMap}, iter, }; @@ -49,7 +49,9 @@ impl SamplesTracer { pub fn mark_nullable(&mut self) { use SamplesTracer::*; match self { - Unknown(_) => {} + Unknown(t) => { + t.nullable = true; + } List(t) => { t.nullable = true; } @@ -219,13 +221,15 @@ pub struct StructTracer { pub nullable: bool, pub state: StructTracerState, pub mode: StructMode, - pub field_tracers: Vec, - pub field_names: Vec, + pub fields: Vec, pub index: HashMap, - // TODO: document and clean up these fields - pub item_index: usize, - pub seen_this_item: BTreeSet, - pub seen_previous_items: BTreeSet, + pub current_sample: usize, +} + +pub struct Field { + pub name: String, + pub tracer: SamplesTracer, + pub last_seen_in_sample: usize, } #[derive(Debug, Clone, Copy)] @@ -246,43 +250,36 @@ impl StructTracer { path, options, mode, - field_tracers: Vec::new(), - field_names: Vec::new(), + fields: Vec::new(), index: HashMap::new(), nullable, state: StructTracerState::WaitForKey, - item_index: 0, - seen_this_item: BTreeSet::new(), - seen_previous_items: BTreeSet::new(), + current_sample: 0, } } - pub fn mark_seen(&mut self, field: usize) { - self.seen_this_item.insert(field); - } - pub fn to_field(&self, name: &str) -> Result { if !matches!(self.state, StructTracerState::Finished) { fail!("Cannot 
build field {name} from unfinished tracer"); } - let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); - for (tracer, name) in iter::zip(&self.field_tracers, &self.field_names) { - field.children.push(tracer.to_field(name)?); + let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); + for field in &self.fields { + res_field.children.push(field.tracer.to_field(&field.name)?); } if let StructMode::Map = self.mode { - field.children.sort_by(|a, b| a.name.cmp(&b.name)); - field.strategy = Some(Strategy::MapAsStruct); + res_field.children.sort_by(|a, b| a.name.cmp(&b.name)); + res_field.strategy = Some(Strategy::MapAsStruct); } - Ok(field) + Ok(res_field) } pub fn reset(&mut self) -> Result<()> { if !matches!(self.state, StructTracerState::Finished) { fail!("Cannot reset unfinished tracer"); } - for tracer in &mut self.field_tracers { - tracer.reset()?; + for field in &mut self.fields { + field.tracer.reset()?; } self.state = StructTracerState::WaitForKey; @@ -294,8 +291,8 @@ impl StructTracer { fail!("Incomplete struct in schema tracing"); } - for tracer in &mut self.field_tracers { - tracer.finish()?; + for field in &mut self.fields { + field.tracer.finish()?; } self.state = StructTracerState::Finished; @@ -320,48 +317,53 @@ impl EventSink for StructTracer { (WaitForKey, ev) => fail!("Invalid event {ev} for struct tracer in state Start"), (InKey, E::Item) => InKey, (InKey, E::Str(key)) => { - if let Some(&field) = self.index.get(key) { - self.mark_seen(field); - InValue(field, 0) + if let Some(&field_idx) = self.index.get(key) { + let Some(field) = self.fields.get_mut(field_idx) else { + fail!("invalid state"); + }; + field.last_seen_in_sample = self.current_sample; + + InValue(field_idx, 0) } else { - let field = self.field_tracers.len(); - self.field_tracers.push(SamplesTracer::new( - format!("{path}.{key}", path = self.path), - self.options.clone(), - )); - self.field_names.push(key.to_owned()); - 
self.index.insert(key.to_owned(), field); - self.mark_seen(field); - InValue(field, 0) + let mut field = Field { + tracer: SamplesTracer::new( + format!("{path}.{key}", path = self.path), + self.options.clone(), + ), + name: key.to_owned(), + last_seen_in_sample: self.current_sample, + }; + + // field was missing in previous samples + if self.current_sample != 0 { + println!("{key}"); + field.tracer.mark_nullable(); + } + + let field_idx = self.fields.len(); + self.fields.push(field); + self.index.insert(key.to_owned(), field_idx); + InValue(field_idx, 0) } } (InKey, E::EndStruct | E::EndMap) => { - if self.item_index == 0 { - self.seen_previous_items = self.seen_this_item.clone(); - } - - for (field, tracer) in self.field_tracers.iter_mut().enumerate() { - if !self.seen_this_item.contains(&field) - || !self.seen_previous_items.contains(&field) - { - tracer.mark_nullable(); + for field in &mut self.fields { + // field. was not seen in this sample + if field.last_seen_in_sample != self.current_sample { + field.tracer.mark_nullable(); } } - for seen in &self.seen_this_item { - self.seen_previous_items.insert(*seen); - } - self.seen_this_item.clear(); - self.item_index += 1; + self.current_sample += 1; WaitForKey } (InKey, ev) => fail!("Invalid event {ev} for struct tracer in state Key"), (InValue(field, depth), ev) if ev.is_start() => { - self.field_tracers[field].accept(ev)?; + self.fields[field].tracer.accept(ev)?; InValue(field, depth + 1) } (InValue(field, depth), ev) if ev.is_end() => { - self.field_tracers[field].accept(ev)?; + self.fields[field].tracer.accept(ev)?; match depth { 0 => fail!("Invalid closing event in struct tracer in state Value"), 1 => InKey, @@ -369,12 +371,12 @@ impl EventSink for StructTracer { } } (InValue(field, depth), ev) if ev.is_marker() => { - self.field_tracers[field].accept(ev)?; + self.fields[field].tracer.accept(ev)?; // markers are always followed by the actual value InValue(field, depth) } (InValue(field, depth), ev) => { - 
self.field_tracers[field].accept(ev)?; + self.fields[field].tracer.accept(ev)?; match depth { // Any event at depth == 0 that does not start a structure (is a complete value) 0 => InKey, @@ -529,10 +531,11 @@ impl EventSink for TupleTracer { } pub struct ListTracer { + pub path: String, + pub options: TracingOptions, pub item_tracer: Box, pub nullable: bool, pub state: ListTracerState, - pub path: String, } #[derive(Debug, Clone, Copy)] @@ -547,6 +550,7 @@ impl ListTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { path: path.clone(), + options: options.clone(), item_tracer: Box::new(SamplesTracer::new(path, options)), nullable, state: ListTracerState::WaitForStart, @@ -637,14 +641,18 @@ impl EventSink for ListTracer { } pub struct UnionTracer { - pub variants: Vec>, - pub tracers: BTreeMap, + pub variants: Vec>, pub nullable: bool, - pub next: UnionTracerState, + pub state: UnionTracerState, pub path: String, pub options: TracingOptions, } +pub struct Variant { + name: String, + tracer: SamplesTracer, +} + #[derive(Debug, Clone, Copy)] pub enum UnionTracerState { /// Wait for the next variant @@ -660,34 +668,26 @@ impl UnionTracer { path, options, variants: Vec::new(), - tracers: BTreeMap::new(), nullable, - next: UnionTracerState::WaitForVariant, + state: UnionTracerState::WaitForVariant, } } pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.next, UnionTracerState::Finished) { + if !matches!(self.state, UnionTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); } let mut field = GenericField::new(name, GenericDataType::Union, self.nullable); - for (idx, variant_name) in self.variants.iter().enumerate() { - if let Some(variant_name) = variant_name { - let Some(tracer) = self.tracers.get(&idx) else { - panic!(concat!( - "invalid state: tracer for variant {idx} with name {variant_name:?} not initialized. 
", - "This should not happen, please open an issue at https://github.com/chmp/serde_arrow", - ), idx=idx, variant_name=variant_name); - }; - - field.children.push(tracer.to_field(variant_name)?); + for variant in &self.variants { + if let Some(variant) = variant { + field.children.push(variant.tracer.to_field(&variant.name)?); } else { field.children.push( GenericField::new("", GenericDataType::Null, true) .with_strategy(Strategy::UnknownVariant), ); - } + }; } Ok(field) @@ -702,41 +702,50 @@ impl UnionTracer { self.variants.push(None); } - self.tracers.entry(idx).or_insert_with(|| { - SamplesTracer::new( - format!("{path}.{key}", path = self.path, key = variant.as_ref()), - self.options.clone(), - ) - }); - - if let Some(prev) = self.variants[idx].as_ref() { + if let Some(prev) = self.variants[idx].as_mut() { let variant = variant.as_ref(); - if prev != variant { - fail!("Incompatible names for variant {idx}: {prev}, {variant}"); + if prev.name != variant { + fail!( + "Incompatible names for variant {idx}: {prev}, {variant}", + prev = prev.name + ); } } else { - self.variants[idx] = Some(variant.into()); + let tracer = SamplesTracer::new( + format!("{path}.{key}", path = self.path, key = variant.as_ref()), + self.options.clone(), + ); + let name = variant.into(); + + self.variants[idx] = Some(Variant { name, tracer }); } + Ok(()) } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.next, UnionTracerState::Finished) { + if !matches!(self.state, UnionTracerState::Finished) { fail!("Cannot reset unfinished union tracer"); } - for tracer in self.tracers.values_mut() { - tracer.reset()?; + for variant in &mut self.variants { + let Some(variant) = variant.as_mut() else { + continue; + }; + variant.tracer.reset()?; } - self.next = UnionTracerState::WaitForVariant; + self.state = UnionTracerState::WaitForVariant; Ok(()) } pub fn finish(&mut self) -> Result<()> { // TODO: fix me - for tracer in self.tracers.values_mut() { - tracer.finish()?; + for variant in &mut 
self.variants { + let Some(variant) = variant.as_mut() else { + continue; + }; + variant.tracer.finish()?; } - self.next = UnionTracerState::Finished; + self.state = UnionTracerState::Finished; Ok(()) } } @@ -748,7 +757,7 @@ impl EventSink for UnionTracer { type S = UnionTracerState; type E<'a> = Event<'a>; - self.next = match self.next { + self.state = match self.state { S::WaitForVariant => match event { E::Variant(variant, idx) => { self.ensure_variant(variant, idx)?; @@ -763,26 +772,26 @@ impl EventSink for UnionTracer { }, S::InVariant(idx, depth) => match event { ev if ev.is_start() => { - self.tracers.get_mut(&idx).unwrap().accept(ev)?; + self.variants[idx].as_mut().unwrap().tracer.accept(ev)?; S::InVariant(idx, depth + 1) } ev if ev.is_end() => match depth { 0 => fail!("Invalid end event {ev} at depth 0 in UnionTracer"), 1 => { - self.tracers.get_mut(&idx).unwrap().accept(ev)?; + self.variants[idx].as_mut().unwrap().tracer.accept(ev)?; S::WaitForVariant } _ => { - self.tracers.get_mut(&idx).unwrap().accept(ev)?; + self.variants[idx].as_mut().unwrap().tracer.accept(ev)?; S::InVariant(idx, depth - 1) } }, ev if ev.is_marker() => { - self.tracers.get_mut(&idx).unwrap().accept(ev)?; + self.variants[idx].as_mut().unwrap().tracer.accept(ev)?; S::InVariant(idx, depth) } ev if ev.is_value() => { - self.tracers.get_mut(&idx).unwrap().accept(ev)?; + self.variants[idx].as_mut().unwrap().tracer.accept(ev)?; match depth { 0 => S::WaitForVariant, _ => S::InVariant(idx, depth), @@ -802,8 +811,9 @@ impl EventSink for UnionTracer { pub struct MapTracer { pub path: String, - pub key: Box, - pub value: Box, + pub options: TracingOptions, + pub key_tracer: Box, + pub value_tracer: Box, pub nullable: bool, pub state: MapTracerState, } @@ -822,8 +832,9 @@ impl MapTracer { pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { Self { nullable, - key: Box::new(SamplesTracer::new(format!("{path}.$key"), options.clone())), - value: 
Box::new(SamplesTracer::new(format!("{path}.$value"), options)), + options: options.clone(), + key_tracer: Box::new(SamplesTracer::new(format!("{path}.$key"), options.clone())), + value_tracer: Box::new(SamplesTracer::new(format!("{path}.$value"), options)), state: MapTracerState::WaitForKey, path, } @@ -835,8 +846,8 @@ impl MapTracer { } let mut entries = GenericField::new("entries", GenericDataType::Struct, false); - entries.children.push(self.key.to_field("key")?); - entries.children.push(self.value.to_field("value")?); + entries.children.push(self.key_tracer.to_field("key")?); + entries.children.push(self.value_tracer.to_field("value")?); let mut field = GenericField::new(name, GenericDataType::Map, self.nullable); field.children.push(entries); @@ -845,15 +856,15 @@ impl MapTracer { } pub fn reset(&mut self) -> Result<()> { - self.key.reset()?; - self.value.reset()?; + self.key_tracer.reset()?; + self.value_tracer.reset()?; self.state = MapTracerState::WaitForKey; Ok(()) } pub fn finish(&mut self) -> Result<()> { - self.key.finish()?; - self.value.finish()?; + self.key_tracer.finish()?; + self.value_tracer.finish()?; self.state = MapTracerState::Finished; Ok(()) } @@ -885,24 +896,24 @@ impl EventSink for MapTracer { S::WaitForKey } 1 => { - self.key.accept(ev)?; + self.key_tracer.accept(ev)?; S::InValue(0) } _ => { - self.key.accept(ev)?; + self.key_tracer.accept(ev)?; S::InKey(depth - 1) } }, ev if ev.is_start() => { - self.key.accept(ev)?; + self.key_tracer.accept(ev)?; S::InKey(depth + 1) } ev if ev.is_marker() => { - self.key.accept(ev)?; + self.key_tracer.accept(ev)?; S::InKey(depth) } ev if ev.is_value() => { - self.key.accept(ev)?; + self.key_tracer.accept(ev)?; if depth == 0 { S::InValue(0) } else { @@ -915,24 +926,24 @@ impl EventSink for MapTracer { ev if ev.is_end() => match depth { 0 => fail!("Unexpected event {ev} in State Value at depth 0 in MapTracer"), 1 => { - self.value.accept(ev)?; + self.value_tracer.accept(ev)?; S::InKey(0) } _ => { - 
self.value.accept(ev)?; + self.value_tracer.accept(ev)?; S::InValue(depth - 1) } }, ev if ev.is_start() => { - self.value.accept(ev)?; + self.value_tracer.accept(ev)?; S::InValue(depth + 1) } ev if ev.is_marker() => { - self.value.accept(ev)?; + self.value_tracer.accept(ev)?; S::InValue(depth) } ev if ev.is_value() => { - self.value.accept(ev)?; + self.value_tracer.accept(ev)?; if depth == 0 { S::InKey(0) } else { diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index c186394b..a58e3402 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use crate::internal::{ error::{fail, Result}, schema::{GenericDataType, GenericField, Strategy}, @@ -27,6 +29,7 @@ defined_tracer!( List(ListTracer), Map(MapTracer), Struct(StructTracer), + Tuple(TupleTracer), Union(UnionTracer), ); @@ -78,6 +81,7 @@ impl Tracer { } } +// TODO: move into trace any? impl Tracer { pub fn mark_nullable(&mut self) { dispatch_tracer!(self, tracer => { tracer.nullable = true; }); @@ -86,21 +90,35 @@ impl Tracer { pub fn ensure_struct(&mut self, fields: &[S]) -> Result<()> { match self { this @ Self::Unknown(_) => { + let field_names = fields + .iter() + .map(|field| field.to_string()) + .collect::>(); + let index = field_names + .iter() + .enumerate() + .map(|(idx, name)| (name.to_string(), idx)) + .collect::>(); + let tracer = StructTracer { path: this.get_path().to_owned(), options: this.get_options().clone(), - field_tracers: fields + fields: fields .iter() - .map(|field| { - Tracer::new( + .map(|field| Field { + tracer: Tracer::new( format!("{}.{}", this.get_path(), field), this.get_options().clone(), - ) + ), + name: field.to_string(), + last_seen_in_sample: 0, }) .collect(), - field_names: fields.iter().map(|field| field.to_string()).collect(), + index, nullable: this.get_nullable(), - strategy: None, + mode: StructMode::Struct, + state: 
StructTracerState::WaitForKey, + current_sample: 0, }; *this = Self::Struct(tracer); Ok(()) @@ -116,29 +134,64 @@ impl Tracer { } } + pub fn ensure_tuple(&mut self, num_fields: usize) -> Result<()> { + match self { + this @ Self::Unknown(_) => { + let tracer = TupleTracer { + path: this.get_path().to_owned(), + options: this.get_options().clone(), + field_tracers: (0..num_fields) + .into_iter() + .map(|i| { + Tracer::new( + format!("{}.{}", this.get_path(), i), + this.get_options().clone(), + ) + }) + .collect(), + nullable: this.get_nullable(), + state: TupleTracerState::WaitForStart, + }; + *this = Self::Tuple(tracer); + Ok(()) + } + Self::Tuple(_tracer) => { + // TODO: check fields are equal + Ok(()) + } + _ => fail!( + "mismatched types, previous {:?}, current struct", + self.get_type() + ), + } + } + pub fn ensure_union(&mut self, variants: &[&str]) -> Result<()> { match self { this @ Self::Unknown(_) => { let tracer = UnionTracer { path: this.get_path().to_owned(), options: this.get_options().clone(), - variant_tracers: variants + state: UnionTracerState::WaitForVariant, + variants: variants .iter() .map(|variant| { - Tracer::new( - format!("{}.{}", this.get_path(), variant), - this.get_options().clone(), - ) + Some(Variant { + name: variant.to_string(), + tracer: Tracer::new( + format!("{}.{}", this.get_path(), variant), + this.get_options().clone(), + ), + }) }) .collect(), - variant_names: variants.iter().map(|s| s.to_string()).collect(), nullable: this.get_nullable(), }; *this = Self::Union(tracer); Ok(()) } Self::Union(_tracer) => { - // TODO: check fields are equal + // TODO: check fields are equal or fill missing fields Ok(()) } _ => fail!( @@ -159,6 +212,7 @@ impl Tracer { format!("{}.item", this.get_path()), this.get_options().clone(), )), + state: ListTracerState::WaitForStart, }; *this = Self::List(tracer); Ok(()) @@ -186,6 +240,7 @@ impl Tracer { format!("{}.value", this.get_path()), this.get_options().clone(), )), + state: 
MapTracerState::WaitForKey, }; *this = Self::Map(tracer); Ok(()) @@ -326,6 +381,17 @@ pub struct MapTracer { pub nullable: bool, pub key_tracer: Box, pub value_tracer: Box, + pub state: MapTracerState, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum MapTracerState { + WaitForKey, + /// Process the current key at `(depth)` + InKey(usize), + /// Process the current value at `(depth)` + InValue(usize), + Finished, } impl MapTracer { @@ -369,6 +435,15 @@ pub struct ListTracer { pub options: TracingOptions, pub nullable: bool, pub item_tracer: Box, + pub state: ListTracerState, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum ListTracerState { + WaitForStart, + WaitForItem, + InItem(usize), + Finished, } impl ListTracer { @@ -380,54 +455,77 @@ impl ListTracer { self.item_tracer.is_complete() } - pub fn to_field(&self, name: &str) -> Result { - let item = self.item_tracer.to_field("item")?; - let res = - GenericField::new(name, GenericDataType::LargeList, self.nullable).with_child(item); - Ok(res) - } - pub fn get_type(&self) -> Option<&GenericDataType> { Some(&GenericDataType::LargeList) } + pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, ListTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } + + let mut field = GenericField::new(name, GenericDataType::LargeList, self.nullable); + field.children.push(self.item_tracer.to_field("item")?); + + Ok(field) + } + pub fn reset(&mut self) -> Result<()> { - self.item_tracer.reset() + if !matches!(self.state, ListTracerState::Finished) { + fail!("Cannot reset unfinished list tracer"); + } + self.item_tracer.reset()?; + self.state = ListTracerState::Finished; + Ok(()) } pub fn finish(&mut self) -> Result<()> { - self.item_tracer.finish() + if !matches!(self.state, ListTracerState::WaitForStart) { + fail!("Incomplete list in schema tracing"); + } + self.item_tracer.finish()?; + self.state = ListTracerState::Finished; + Ok(()) } } #[derive(Debug, 
PartialEq, Clone)] -pub struct StructTracer { +pub struct TupleTracer { pub path: String, pub options: TracingOptions, pub nullable: bool, - pub field_names: Vec, pub field_tracers: Vec, - pub strategy: Option, + pub state: TupleTracerState, } -impl StructTracer { +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum TupleTracerState { + WaitForStart, + /// Wait for the item with `(field_index)` + WaitForItem(usize), + /// Process the item at `(field_index, depth)` + InItem(usize, usize), + Finished, +} + +impl TupleTracer { pub fn get_path(&self) -> &str { &self.path } pub fn is_complete(&self) -> bool { - self.field_tracers.iter().all(Tracer::is_complete) + self.field_tracers.iter().all(|tracer| tracer.is_complete()) } pub fn to_field(&self, name: &str) -> Result { - let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); + let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); - for (tracer, name) in self.field_tracers.iter().zip(&self.field_names) { - field.children.push(tracer.to_field(name)?); + for (idx, tracer) in self.field_tracers.iter().enumerate() { + res_field.children.push(tracer.to_field(&idx.to_string())?); } - field.strategy = self.strategy.clone(); + res_field.strategy = Some(Strategy::TupleAsStruct); - Ok(field) + Ok(res_field) } pub fn get_type(&self) -> Option<&GenericDataType> { @@ -449,14 +547,103 @@ impl StructTracer { } } +#[derive(Debug, PartialEq, Clone)] +pub struct StructTracer { + pub path: String, + pub options: TracingOptions, + pub nullable: bool, + pub fields: Vec, + pub index: HashMap, + pub mode: StructMode, + pub state: StructTracerState, + pub current_sample: usize, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct Field { + pub name: String, + pub tracer: Tracer, + pub last_seen_in_sample: usize, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum StructMode { + Struct, + Map, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum StructTracerState { + /// The 
tracer is waiting for the next key + WaitForKey, + /// The tracer is currently processing the next key + InKey, + /// The tracer is currently tracing a value for `(field, depth)` + InValue(usize, usize), + /// The tracer is finished + Finished, +} + +impl StructTracer { + pub fn get_path(&self) -> &str { + &self.path + } + + pub fn is_complete(&self) -> bool { + self.fields.iter().all(|field| field.tracer.is_complete()) + } + + pub fn to_field(&self, name: &str) -> Result { + let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); + for field in &self.fields { + res_field.children.push(field.tracer.to_field(&field.name)?); + } + + Ok(res_field) + } + + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::Struct) + } + + pub fn reset(&mut self) -> Result<()> { + for field in &mut self.fields { + field.tracer.reset()?; + } + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + for field in &mut self.fields { + field.tracer.finish()?; + } + Ok(()) + } +} + #[derive(Debug, PartialEq, Clone)] pub struct UnionTracer { pub path: String, pub options: TracingOptions, pub nullable: bool, - pub variant_names: Vec, - pub variant_tracers: Vec, + pub variants: Vec>, + pub state: UnionTracerState, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct Variant { + pub name: String, + pub tracer: Tracer, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum UnionTracerState { + /// Wait for the next variant + WaitForVariant, + /// Process the current variant at `(variant_index, depth)` + InVariant(usize, usize), + Finished, } impl UnionTracer { @@ -465,14 +652,29 @@ impl UnionTracer { } pub fn is_complete(&self) -> bool { - self.variant_tracers.iter().all(Tracer::is_complete) + self.variants + .iter() + .flat_map(|opt| opt.as_ref()) + .all(|variant| variant.tracer.is_complete()) } pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, UnionTracerState::Finished) { + fail!("Cannot build field {name} from 
unfinished tracer"); + } + let mut field = GenericField::new(name, GenericDataType::Union, self.nullable); - for (tracer, name) in self.variant_tracers.iter().zip(&self.variant_names) { - field.children.push(tracer.to_field(name)?); + for variant in &self.variants { + if let Some(variant) = variant { + field.children.push(variant.tracer.to_field(&variant.name)?); + } else { + field.children.push( + GenericField::new("", GenericDataType::Null, true) + .with_strategy(Strategy::UnknownVariant), + ); + }; } + Ok(field) } @@ -481,16 +683,28 @@ impl UnionTracer { } pub fn reset(&mut self) -> Result<()> { - for tracer in &mut self.variant_tracers { - tracer.reset()?; + if !matches!(self.state, UnionTracerState::Finished) { + fail!("Cannot reset unfinished union tracer"); } + for variant in &mut self.variants { + let Some(variant) = variant.as_mut() else { + continue; + }; + variant.tracer.reset()?; + } + self.state = UnionTracerState::WaitForVariant; Ok(()) } pub fn finish(&mut self) -> Result<()> { - for tracer in &mut self.variant_tracers { - tracer.finish()?; + // TODO: fix me + for variant in &mut self.variants { + let Some(variant) = variant.as_mut() else { + continue; + }; + variant.tracer.finish()?; } + self.state = UnionTracerState::Finished; Ok(()) } } diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index d23a8204..ee98263f 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -5,8 +5,11 @@ use serde::{ use crate::internal::{ fail, - schema::{GenericField, Strategy}, - tracing::{tracer::Tracer, TracingOptions}, + schema::GenericField, + tracing::{ + tracer::{Field, Tracer}, + TracingOptions, + }, Error, Result, }; @@ -161,16 +164,14 @@ impl<'de, 'a> serde::de::Deserializer<'de> for TraceAny<'a> { } fn deserialize_tuple>(self, len: usize, visitor: V) -> Result { - let field_names = (0..len).map(|idx| idx.to_string()).collect::>(); - 
self.0.ensure_struct(&field_names)?; + self.0.ensure_tuple(len)?; - let Tracer::Struct(tracer) = self.0 else { + let Tracer::Tuple(tracer) = self.0 else { unreachable!(); }; - tracer.strategy = Some(Strategy::TupleAsStruct); visitor.visit_seq(TraceTupleStruct { - field_tracers: &mut tracer.field_tracers, + tracers: &mut tracer.field_tracers, pos: 0, }) } @@ -212,9 +213,9 @@ impl<'de, 'a> serde::de::Deserializer<'de> for TraceAny<'a> { }; visitor.visit_map(TraceStruct { - field_tracers: &mut tracer.field_tracers, + fields: &mut tracer.fields, pos: 0, - fields, + names: fields, }) } @@ -231,18 +232,22 @@ impl<'de, 'a> serde::de::Deserializer<'de> for TraceAny<'a> { }; let idx = tracer - .variant_tracers + .variants .iter() - .position(|tracer| tracer.is_unknown()) + .position(|opt| opt.as_ref().unwrap().tracer.is_unknown()) .unwrap_or_default(); - if idx >= tracer.variant_tracers.len() { + if idx >= tracer.variants.len() { fail!("invalid variant index"); } + let Some(variant) = tracer.variants[idx].as_mut() else { + fail!("invalid state"); + }; + let res = visitor.visit_enum(TraceEnum { - tracer: &mut tracer.variant_tracers[idx], + tracer: &mut variant.tracer, pos: idx, - variant: &tracer.variant_names[idx], + variant: &variant.name, })?; Ok(res) } @@ -282,7 +287,7 @@ impl<'de, 'a> serde::de::MapAccess<'de> for TraceMap<'a> { } struct TraceTupleStruct<'a> { - field_tracers: &'a mut [Tracer], + tracers: &'a mut [Tracer], pos: usize, } @@ -290,11 +295,11 @@ impl<'de, 'a> serde::de::SeqAccess<'de> for TraceTupleStruct<'a> { type Error = Error; fn next_element_seed>(&mut self, seed: T) -> Result> { - if self.pos >= self.field_tracers.len() { + if self.pos >= self.tracers.len() { return Ok(None); } - let item = seed.deserialize(TraceAny(&mut self.field_tracers[self.pos]))?; + let item = seed.deserialize(TraceAny(&mut self.tracers[self.pos]))?; self.pos += 1; Ok(Some(item)) @@ -302,27 +307,27 @@ impl<'de, 'a> serde::de::SeqAccess<'de> for TraceTupleStruct<'a> { } struct 
TraceStruct<'a> { - field_tracers: &'a mut [Tracer], + fields: &'a mut [Field], pos: usize, - fields: &'static [&'static str], + names: &'static [&'static str], } impl<'de, 'a> serde::de::MapAccess<'de> for TraceStruct<'a> { type Error = Error; fn next_key_seed>(&mut self, seed: K) -> Result> { - if self.pos >= self.fields.len() { + if self.pos >= self.names.len() { return Ok(None); } let key = seed.deserialize(IdentifierDeserializer { idx: self.pos, - name: self.fields[self.pos], + name: self.names[self.pos], })?; Ok(Some(key)) } fn next_value_seed>(&mut self, seed: V) -> Result { - let value = seed.deserialize(TraceAny(&mut self.field_tracers[self.pos]))?; + let value = seed.deserialize(TraceAny(&mut self.fields[self.pos].tracer))?; self.pos += 1; Ok(value) @@ -533,7 +538,10 @@ fn trace_struct() { #[test] fn trace_tuple_as_struct() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; + use { + crate::internal::schema::GenericDataType as T, crate::internal::schema::Strategy, + GenericField as F, + }; let actual = trace_type::<(bool, Option)>(TracingOptions::default(), "root").unwrap(); let expected = F::new("root", T::Struct, false) From 0025da601dc381997abea53efa142f91d92a0031 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 09:44:35 +0200 Subject: [PATCH 06/27] Unify tracers --- serde_arrow/src/internal/mod.rs | 6 +- serde_arrow/src/internal/tracing/mod.rs | 7 +- serde_arrow/src/internal/tracing/samples.rs | 503 +------------------- serde_arrow/src/internal/tracing/tracer.rs | 168 ++++++- serde_arrow/src/internal/tracing/types.rs | 16 +- serde_arrow/src/test_impls/macros.rs | 4 +- 6 files changed, 178 insertions(+), 526 deletions(-) diff --git a/serde_arrow/src/internal/mod.rs b/serde_arrow/src/internal/mod.rs index 5233c778..b758cb1b 100644 --- a/serde_arrow/src/internal/mod.rs +++ b/serde_arrow/src/internal/mod.rs @@ -19,7 +19,7 @@ use self::{ schema::{GenericDataType, GenericField}, sink::{serialize_into_sink, 
EventSerializer, EventSink, StripOuterSequenceSink}, source::deserialize_from_source, - tracing::{trace_type, SamplesTracer, TracingOptions}, + tracing::{trace_type, Tracer, TracingOptions}, }; pub static CONFIGURATION: RwLock = RwLock::new(Configuration { @@ -57,7 +57,7 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result( where T: Serialize + ?Sized, { - let tracer = SamplesTracer::new(String::from("$"), options); + let tracer = Tracer::new(String::from("$"), options); let tracer = StripOuterSequenceSink::new(tracer); let mut tracer = tracer; serialize_into_sink(&mut tracer, items)?; diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index e3da14aa..b30a7c1c 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -4,12 +4,9 @@ pub mod types; use serde::{Deserialize, Serialize}; -use crate::internal::{ - schema::{GenericField, Schema}, - Result, -}; +use crate::internal::{schema::Schema, Result}; -pub use samples::SamplesTracer; +pub use tracer::Tracer; pub use types::trace_type; /// Configure how the schema is traced diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index b1786635..0b4881ce 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -1,104 +1,20 @@ -use std::{ - collections::{BTreeSet, HashMap}, - iter, -}; +use std::collections::HashMap; use crate::internal::{ error::{fail, Result}, event::Event, + schema::{GenericDataType, Strategy}, + sink::macros, sink::EventSink, - tracing::tracer::{PrimitiveTracer, UnknownTracer}, -}; - -use super::{ - super::{ - schema::{GenericDataType, GenericField, Strategy}, - sink::macros, + tracing::tracer::{ + ListTracer, ListTracerState, MapTracer, MapTracerState, PrimitiveTracer, StructField, + StructMode, StructTracer, StructTracerState, Tracer, TupleTracer, TupleTracerState, + UnionTracer, 
UnionTracerState, }, - TracingOptions, + tracing::TracingOptions, }; -pub enum SamplesTracer { - Unknown(UnknownTracer), - Primitive(PrimitiveTracer), - List(ListTracer), - Map(MapTracer), - Struct(StructTracer), - Union(UnionTracer), - Tuple(TupleTracer), -} - -impl SamplesTracer { - pub fn new(path: String, options: TracingOptions) -> Self { - Self::Unknown(UnknownTracer::new(path, options)) - } - - pub fn to_field(&self, name: &str) -> Result { - use SamplesTracer::*; - match self { - Unknown(t) => t.to_field(name), - List(t) => t.to_field(name), - Map(t) => t.to_field(name), - Primitive(t) => t.to_field(name), - Tuple(t) => t.to_field(name), - Union(t) => t.to_field(name), - Struct(t) => t.to_field(name), - } - } - - pub fn mark_nullable(&mut self) { - use SamplesTracer::*; - match self { - Unknown(t) => { - t.nullable = true; - } - List(t) => { - t.nullable = true; - } - Map(t) => { - t.nullable = true; - } - Primitive(t) => { - t.nullable = true; - } - Tuple(t) => { - t.nullable = true; - } - Union(t) => { - t.nullable = true; - } - Struct(t) => { - t.nullable = true; - } - } - } - - fn reset(&mut self) -> Result<()> { - match self { - Self::Unknown(tracer) => tracer.reset(), - Self::List(tracer) => tracer.reset(), - Self::Struct(tracer) => tracer.reset(), - Self::Primitive(tracer) => tracer.reset(), - Self::Tuple(tracer) => tracer.reset(), - Self::Union(tracer) => tracer.reset(), - Self::Map(tracer) => tracer.reset(), - } - } - - pub fn finish(&mut self) -> Result<()> { - match self { - Self::Unknown(tracer) => tracer.finish(), - Self::List(tracer) => tracer.finish(), - Self::Struct(tracer) => tracer.finish(), - Self::Primitive(tracer) => tracer.finish(), - Self::Tuple(tracer) => tracer.finish(), - Self::Union(tracer) => tracer.finish(), - Self::Map(tracer) => tracer.finish(), - } - } -} - -impl EventSink for SamplesTracer { +impl EventSink for Tracer { macros::forward_specialized_to_generic!(); fn accept(&mut self, event: Event<'_>) -> Result<()> { @@ 
-126,7 +42,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Primitive(tracer) + *self = Tracer::Primitive(tracer) } Event::StartSequence => { let mut tracer = ListTracer::new( @@ -135,7 +51,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::List(tracer); + *self = Tracer::List(tracer); } Event::StartStruct => { let mut tracer = StructTracer::new( @@ -145,7 +61,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Struct(tracer); + *self = Tracer::Struct(tracer); } Event::StartTuple => { let mut tracer = TupleTracer::new( @@ -154,7 +70,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Tuple(tracer); + *self = Tracer::Tuple(tracer); } Event::StartMap => { if tracer.options.map_as_struct { @@ -165,7 +81,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Struct(tracer); + *self = Tracer::Struct(tracer); } else { let mut tracer = MapTracer::new( tracer.path.clone(), @@ -173,7 +89,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Map(tracer); + *self = Tracer::Map(tracer); } } Event::Variant(_, _) => { @@ -183,7 +99,7 @@ impl EventSink for SamplesTracer { tracer.nullable, ); tracer.accept(event)?; - *self = SamplesTracer::Union(tracer) + *self = Tracer::Union(tracer) } ev if ev.is_end() => fail!( "Invalid end nesting events for unknown tracer ({path})", @@ -205,45 +121,10 @@ impl EventSink for SamplesTracer { } fn finish(&mut self) -> Result<()> { - SamplesTracer::finish(self) + Tracer::finish(self) } } -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum StructMode { - Struct, - Map, -} - -pub struct StructTracer { - pub path: String, - pub options: TracingOptions, - pub nullable: bool, - pub state: StructTracerState, - pub mode: 
StructMode, - pub fields: Vec, - pub index: HashMap, - pub current_sample: usize, -} - -pub struct Field { - pub name: String, - pub tracer: SamplesTracer, - pub last_seen_in_sample: usize, -} - -#[derive(Debug, Clone, Copy)] -pub enum StructTracerState { - /// The tracer is waiting for the next key - WaitForKey, - /// The tracer is currently processing the next key - InKey, - /// The tracer is currently tracing a value for `(field, depth)` - InValue(usize, usize), - /// The tracer is finished - Finished, -} - impl StructTracer { pub fn new(path: String, options: TracingOptions, mode: StructMode, nullable: bool) -> Self { Self { @@ -257,48 +138,6 @@ impl StructTracer { current_sample: 0, } } - - pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.state, StructTracerState::Finished) { - fail!("Cannot build field {name} from unfinished tracer"); - } - let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); - for field in &self.fields { - res_field.children.push(field.tracer.to_field(&field.name)?); - } - - if let StructMode::Map = self.mode { - res_field.children.sort_by(|a, b| a.name.cmp(&b.name)); - res_field.strategy = Some(Strategy::MapAsStruct); - } - Ok(res_field) - } - - pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, StructTracerState::Finished) { - fail!("Cannot reset unfinished tracer"); - } - for field in &mut self.fields { - field.tracer.reset()?; - } - - self.state = StructTracerState::WaitForKey; - Ok(()) - } - - pub fn finish(&mut self) -> Result<()> { - if !matches!(self.state, StructTracerState::WaitForKey) { - fail!("Incomplete struct in schema tracing"); - } - - for field in &mut self.fields { - field.tracer.finish()?; - } - - self.state = StructTracerState::Finished; - - Ok(()) - } } impl EventSink for StructTracer { @@ -325,8 +164,8 @@ impl EventSink for StructTracer { InValue(field_idx, 0) } else { - let mut field = Field { - tracer: SamplesTracer::new( + let mut field = StructField 
{ + tracer: Tracer::new( format!("{path}.{key}", path = self.path), self.options.clone(), ), @@ -393,82 +232,6 @@ impl EventSink for StructTracer { } } -pub struct TupleTracer { - pub field_tracers: Vec, - pub nullable: bool, - pub state: TupleTracerState, - pub path: String, - pub options: TracingOptions, -} - -#[derive(Debug, Clone, Copy)] -pub enum TupleTracerState { - WaitForStart, - /// Wait for the item with `(field_index)` - WaitForItem(usize), - /// Process the item at `(field_index, depth)` - InItem(usize, usize), - Finished, -} - -impl TupleTracer { - pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { - Self { - path, - options, - field_tracers: Vec::new(), - nullable, - state: TupleTracerState::WaitForStart, - } - } - - pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.state, TupleTracerState::Finished) { - fail!("Cannot build field {name} from unfinished tracer"); - } - - let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); - for (idx, tracer) in self.field_tracers.iter().enumerate() { - field.children.push(tracer.to_field(&idx.to_string())?); - } - field.strategy = Some(Strategy::TupleAsStruct); - - Ok(field) - } - - fn field_tracer(&mut self, idx: usize) -> &mut SamplesTracer { - while self.field_tracers.len() <= idx { - self.field_tracers.push(SamplesTracer::new( - format!("{path}.{idx}", path = self.path), - self.options.clone(), - )); - } - &mut self.field_tracers[idx] - } - - pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, TupleTracerState::Finished) { - fail!("Cannot reset unfinished tuple tracer"); - } - for tracer in &mut self.field_tracers { - tracer.reset()?; - } - self.state = TupleTracerState::WaitForStart; - Ok(()) - } - - pub fn finish(&mut self) -> Result<()> { - if !matches!(self.state, TupleTracerState::WaitForStart) { - fail!("Incomplete tuple in schema tracing"); - } - for tracer in &mut self.field_tracers { - tracer.finish()?; - } - self.state = 
TupleTracerState::Finished; - Ok(()) - } -} - impl EventSink for TupleTracer { macros::forward_specialized_to_generic!(); @@ -530,63 +293,6 @@ impl EventSink for TupleTracer { } } -pub struct ListTracer { - pub path: String, - pub options: TracingOptions, - pub item_tracer: Box, - pub nullable: bool, - pub state: ListTracerState, -} - -#[derive(Debug, Clone, Copy)] -pub enum ListTracerState { - WaitForStart, - WaitForItem, - InItem(usize), - Finished, -} - -impl ListTracer { - pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { - Self { - path: path.clone(), - options: options.clone(), - item_tracer: Box::new(SamplesTracer::new(path, options)), - nullable, - state: ListTracerState::WaitForStart, - } - } - - pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.state, ListTracerState::Finished) { - fail!("Cannot build field {name} from unfinished tracer"); - } - - let mut field = GenericField::new(name, GenericDataType::LargeList, self.nullable); - field.children.push(self.item_tracer.to_field("element")?); - - Ok(field) - } - - pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, ListTracerState::Finished) { - fail!("Cannot reset unfinished list tracer"); - } - self.item_tracer.reset()?; - self.state = ListTracerState::Finished; - Ok(()) - } - - pub fn finish(&mut self) -> Result<()> { - if !matches!(self.state, ListTracerState::WaitForStart) { - fail!("Incomplete list in schema tracing"); - } - self.item_tracer.finish()?; - self.state = ListTracerState::Finished; - Ok(()) - } -} - impl EventSink for ListTracer { macros::forward_specialized_to_generic!(); @@ -640,116 +346,6 @@ impl EventSink for ListTracer { } } -pub struct UnionTracer { - pub variants: Vec>, - pub nullable: bool, - pub state: UnionTracerState, - pub path: String, - pub options: TracingOptions, -} - -pub struct Variant { - name: String, - tracer: SamplesTracer, -} - -#[derive(Debug, Clone, Copy)] -pub enum UnionTracerState { - /// Wait for the next 
variant - WaitForVariant, - /// Process the current variant at `(variant_index, depth)` - InVariant(usize, usize), - Finished, -} - -impl UnionTracer { - pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { - Self { - path, - options, - variants: Vec::new(), - nullable, - state: UnionTracerState::WaitForVariant, - } - } - - pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.state, UnionTracerState::Finished) { - fail!("Cannot build field {name} from unfinished tracer"); - } - - let mut field = GenericField::new(name, GenericDataType::Union, self.nullable); - for variant in &self.variants { - if let Some(variant) = variant { - field.children.push(variant.tracer.to_field(&variant.name)?); - } else { - field.children.push( - GenericField::new("", GenericDataType::Null, true) - .with_strategy(Strategy::UnknownVariant), - ); - }; - } - - Ok(field) - } - - fn ensure_variant + AsRef>( - &mut self, - variant: S, - idx: usize, - ) -> Result<()> { - while self.variants.len() <= idx { - self.variants.push(None); - } - - if let Some(prev) = self.variants[idx].as_mut() { - let variant = variant.as_ref(); - if prev.name != variant { - fail!( - "Incompatible names for variant {idx}: {prev}, {variant}", - prev = prev.name - ); - } - } else { - let tracer = SamplesTracer::new( - format!("{path}.{key}", path = self.path, key = variant.as_ref()), - self.options.clone(), - ); - let name = variant.into(); - - self.variants[idx] = Some(Variant { name, tracer }); - } - - Ok(()) - } - - pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, UnionTracerState::Finished) { - fail!("Cannot reset unfinished union tracer"); - } - for variant in &mut self.variants { - let Some(variant) = variant.as_mut() else { - continue; - }; - variant.tracer.reset()?; - } - self.state = UnionTracerState::WaitForVariant; - Ok(()) - } - - pub fn finish(&mut self) -> Result<()> { - // TODO: fix me - for variant in &mut self.variants { - let Some(variant) = 
variant.as_mut() else { - continue; - }; - variant.tracer.finish()?; - } - self.state = UnionTracerState::Finished; - Ok(()) - } -} - impl EventSink for UnionTracer { macros::forward_specialized_to_generic!(); @@ -809,67 +405,6 @@ impl EventSink for UnionTracer { } } -pub struct MapTracer { - pub path: String, - pub options: TracingOptions, - pub key_tracer: Box, - pub value_tracer: Box, - pub nullable: bool, - pub state: MapTracerState, -} - -#[derive(Debug, Clone, Copy)] -pub enum MapTracerState { - WaitForKey, - /// Process the current key at `(depth)` - InKey(usize), - /// Process the current value at `(depth)` - InValue(usize), - Finished, -} - -impl MapTracer { - pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { - Self { - nullable, - options: options.clone(), - key_tracer: Box::new(SamplesTracer::new(format!("{path}.$key"), options.clone())), - value_tracer: Box::new(SamplesTracer::new(format!("{path}.$value"), options)), - state: MapTracerState::WaitForKey, - path, - } - } - - pub fn to_field(&self, name: &str) -> Result { - if !matches!(self.state, MapTracerState::Finished) { - fail!("Cannot build field {name} from unfinished tracer"); - } - - let mut entries = GenericField::new("entries", GenericDataType::Struct, false); - entries.children.push(self.key_tracer.to_field("key")?); - entries.children.push(self.value_tracer.to_field("value")?); - - let mut field = GenericField::new(name, GenericDataType::Map, self.nullable); - field.children.push(entries); - - Ok(field) - } - - pub fn reset(&mut self) -> Result<()> { - self.key_tracer.reset()?; - self.value_tracer.reset()?; - self.state = MapTracerState::WaitForKey; - Ok(()) - } - - pub fn finish(&mut self) -> Result<()> { - self.key_tracer.finish()?; - self.value_tracer.finish()?; - self.state = MapTracerState::Finished; - Ok(()) - } -} - impl EventSink for MapTracer { macros::forward_specialized_to_generic!(); diff --git a/serde_arrow/src/internal/tracing/tracer.rs 
b/serde_arrow/src/internal/tracing/tracer.rs index a58e3402..df3e03bf 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -105,7 +105,7 @@ impl Tracer { options: this.get_options().clone(), fields: fields .iter() - .map(|field| Field { + .map(|field| StructField { tracer: Tracer::new( format!("{}.{}", this.get_path(), field), this.get_options().clone(), @@ -141,7 +141,6 @@ impl Tracer { path: this.get_path().to_owned(), options: this.get_options().clone(), field_tracers: (0..num_fields) - .into_iter() .map(|i| { Tracer::new( format!("{}.{}", this.get_path(), i), @@ -176,7 +175,7 @@ impl Tracer { variants: variants .iter() .map(|variant| { - Some(Variant { + Some(UnionVariant { name: variant.to_string(), tracer: Tracer::new( format!("{}.{}", this.get_path(), variant), @@ -395,6 +394,17 @@ pub enum MapTracerState { } impl MapTracer { + pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { + Self { + nullable, + options: options.clone(), + key_tracer: Box::new(Tracer::new(format!("{path}.$key"), options.clone())), + value_tracer: Box::new(Tracer::new(format!("{path}.$value"), options)), + state: MapTracerState::WaitForKey, + path, + } + } + pub fn get_path(&self) -> &str { &self.path } @@ -403,28 +413,36 @@ impl MapTracer { self.key_tracer.is_complete() && self.value_tracer.is_complete() } - pub fn to_field(&self, name: &str) -> Result { - let key = self.key_tracer.to_field("key")?; - let value = self.value_tracer.to_field("value")?; - let res = GenericField::new(name, GenericDataType::Map, self.nullable) - .with_child(key) - .with_child(value); - Ok(res) - } - pub fn get_type(&self) -> Option<&GenericDataType> { Some(&GenericDataType::Map) } + pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, MapTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } + + let mut entries = GenericField::new("entries", GenericDataType::Struct, false); 
+ entries.children.push(self.key_tracer.to_field("key")?); + entries.children.push(self.value_tracer.to_field("value")?); + + let mut field = GenericField::new(name, GenericDataType::Map, self.nullable); + field.children.push(entries); + + Ok(field) + } + pub fn reset(&mut self) -> Result<()> { self.key_tracer.reset()?; self.value_tracer.reset()?; + self.state = MapTracerState::WaitForKey; Ok(()) } pub fn finish(&mut self) -> Result<()> { self.key_tracer.finish()?; self.value_tracer.finish()?; + self.state = MapTracerState::Finished; Ok(()) } } @@ -447,6 +465,16 @@ pub enum ListTracerState { } impl ListTracer { + pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { + Self { + path: path.clone(), + options: options.clone(), + item_tracer: Box::new(Tracer::new(path, options)), + nullable, + state: ListTracerState::WaitForStart, + } + } + pub fn get_path(&self) -> &str { &self.path } @@ -465,7 +493,7 @@ impl ListTracer { } let mut field = GenericField::new(name, GenericDataType::LargeList, self.nullable); - field.children.push(self.item_tracer.to_field("item")?); + field.children.push(self.item_tracer.to_field("element")?); Ok(field) } @@ -509,6 +537,16 @@ pub enum TupleTracerState { } impl TupleTracer { + pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { + Self { + path, + options, + field_tracers: Vec::new(), + nullable, + state: TupleTracerState::WaitForStart, + } + } + pub fn get_path(&self) -> &str { &self.path } @@ -518,14 +556,17 @@ impl TupleTracer { } pub fn to_field(&self, name: &str) -> Result { - let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); + if !matches!(self.state, TupleTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } + let mut field = GenericField::new(name, GenericDataType::Struct, self.nullable); for (idx, tracer) in self.field_tracers.iter().enumerate() { - res_field.children.push(tracer.to_field(&idx.to_string())?); + 
field.children.push(tracer.to_field(&idx.to_string())?); } - res_field.strategy = Some(Strategy::TupleAsStruct); + field.strategy = Some(Strategy::TupleAsStruct); - Ok(res_field) + Ok(field) } pub fn get_type(&self) -> Option<&GenericDataType> { @@ -533,18 +574,36 @@ impl TupleTracer { } pub fn reset(&mut self) -> Result<()> { + if !matches!(self.state, TupleTracerState::Finished) { + fail!("Cannot reset unfinished tuple tracer"); + } for tracer in &mut self.field_tracers { tracer.reset()?; } + self.state = TupleTracerState::WaitForStart; Ok(()) } pub fn finish(&mut self) -> Result<()> { + if !matches!(self.state, TupleTracerState::WaitForStart) { + fail!("Incomplete tuple in schema tracing"); + } for tracer in &mut self.field_tracers { tracer.finish()?; } + self.state = TupleTracerState::Finished; Ok(()) } + + pub fn field_tracer(&mut self, idx: usize) -> &mut Tracer { + while self.field_tracers.len() <= idx { + self.field_tracers.push(Tracer::new( + format!("{path}.{idx}", path = self.path), + self.options.clone(), + )); + } + &mut self.field_tracers[idx] + } } #[derive(Debug, PartialEq, Clone)] @@ -552,7 +611,7 @@ pub struct StructTracer { pub path: String, pub options: TracingOptions, pub nullable: bool, - pub fields: Vec, + pub fields: Vec, pub index: HashMap, pub mode: StructMode, pub state: StructTracerState, @@ -560,7 +619,7 @@ pub struct StructTracer { } #[derive(Debug, PartialEq, Clone)] -pub struct Field { +pub struct StructField { pub name: String, pub tracer: Tracer, pub last_seen_in_sample: usize, @@ -594,11 +653,18 @@ impl StructTracer { } pub fn to_field(&self, name: &str) -> Result { + if !matches!(self.state, StructTracerState::Finished) { + fail!("Cannot build field {name} from unfinished tracer"); + } let mut res_field = GenericField::new(name, GenericDataType::Struct, self.nullable); for field in &self.fields { res_field.children.push(field.tracer.to_field(&field.name)?); } + if let StructMode::Map = self.mode { + res_field.children.sort_by(|a, 
b| a.name.cmp(&b.name)); + res_field.strategy = Some(Strategy::MapAsStruct); + } Ok(res_field) } @@ -607,16 +673,28 @@ impl StructTracer { } pub fn reset(&mut self) -> Result<()> { + if !matches!(self.state, StructTracerState::Finished) { + fail!("Cannot reset unfinished tracer"); + } for field in &mut self.fields { field.tracer.reset()?; } + + self.state = StructTracerState::WaitForKey; Ok(()) } pub fn finish(&mut self) -> Result<()> { + if !matches!(self.state, StructTracerState::WaitForKey) { + fail!("Incomplete struct in schema tracing"); + } + for field in &mut self.fields { field.tracer.finish()?; } + + self.state = StructTracerState::Finished; + Ok(()) } } @@ -627,12 +705,12 @@ pub struct UnionTracer { pub path: String, pub options: TracingOptions, pub nullable: bool, - pub variants: Vec>, + pub variants: Vec>, pub state: UnionTracerState, } #[derive(Debug, PartialEq, Clone)] -pub struct Variant { +pub struct UnionVariant { pub name: String, pub tracer: Tracer, } @@ -647,6 +725,46 @@ pub enum UnionTracerState { } impl UnionTracer { + pub fn new(path: String, options: TracingOptions, nullable: bool) -> Self { + Self { + path, + options, + variants: Vec::new(), + nullable, + state: UnionTracerState::WaitForVariant, + } + } + + pub fn ensure_variant + AsRef>( + &mut self, + variant: S, + idx: usize, + ) -> Result<()> { + while self.variants.len() <= idx { + self.variants.push(None); + } + + if let Some(prev) = self.variants[idx].as_mut() { + let variant = variant.as_ref(); + if prev.name != variant { + fail!( + "Incompatible names for variant {idx}: {prev}, {variant}", + prev = prev.name + ); + } + } else { + let tracer = Tracer::new( + format!("{path}.{key}", path = self.path, key = variant.as_ref()), + self.options.clone(), + ); + let name = variant.into(); + + self.variants[idx] = Some(UnionVariant { name, tracer }); + } + + Ok(()) + } + pub fn get_path(&self) -> &str { &self.path } @@ -658,6 +776,10 @@ impl UnionTracer { .all(|variant| 
variant.tracer.is_complete()) } + pub fn get_type(&self) -> Option<&GenericDataType> { + Some(&GenericDataType::Union) + } + pub fn to_field(&self, name: &str) -> Result { if !matches!(self.state, UnionTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); @@ -678,10 +800,6 @@ impl UnionTracer { Ok(field) } - pub fn get_type(&self) -> Option<&GenericDataType> { - Some(&GenericDataType::Union) - } - pub fn reset(&mut self) -> Result<()> { if !matches!(self.state, UnionTracerState::Finished) { fail!("Cannot reset unfinished union tracer"); diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index ee98263f..29f5a625 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -7,7 +7,7 @@ use crate::internal::{ fail, schema::GenericField, tracing::{ - tracer::{Field, Tracer}, + tracer::{StructField, Tracer}, TracingOptions, }, Error, Result, @@ -307,7 +307,7 @@ impl<'de, 'a> serde::de::SeqAccess<'de> for TraceTupleStruct<'a> { } struct TraceStruct<'a> { - fields: &'a mut [Field], + fields: &'a mut [StructField], pos: usize, names: &'static [&'static str], } @@ -577,7 +577,7 @@ fn trace_list() { let actual = trace_type::>(TracingOptions::default(), "root").unwrap(); let expected = - F::new("root", T::LargeList, false).with_child(F::new("item", T::LargeUtf8, false)); + F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); assert_eq!(actual, expected); } @@ -590,9 +590,11 @@ fn trace_map() { let actual = trace_type::>(TracingOptions::default().map_as_struct(false), "root") .unwrap(); - let expected = F::new("root", T::Map, false) - .with_child(F::new("key", T::I8, false)) - .with_child(F::new("value", T::LargeUtf8, false)); + let expected = F::new("root", T::Map, false).with_child( + F::new("entries", T::Struct, false) + .with_child(F::new("key", T::I8, false)) + .with_child(F::new("value", T::LargeUtf8, false)), + ); 
assert_eq!(actual, expected); } @@ -616,7 +618,7 @@ fn issue_90() { let expected = F::new("root", T::Struct, false).with_child( F::new("distribution", T::Struct, true) .with_child(F::new("samples", T::LargeList, false).with_child(F::new( - "item", + "element", T::F64, false, ))) diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index 267c889d..302e04cb 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -269,7 +269,7 @@ macro_rules! test_events { serialization::{compile_serialization, CompilationOptions, Interpreter}, event::Event, schema::{GenericDataType, GenericField}, - tracing::{SamplesTracer, TracingOptions}, + tracing::{Tracer, TracingOptions}, sink::{accept_events, StripOuterSequenceSink}, }; @@ -282,7 +282,7 @@ macro_rules! test_events { let options = TracingOptions::default(); $(let options = $tracing_options;)? - let tracer = SamplesTracer::new(String::from("$"), options); + let tracer = Tracer::new(String::from("$"), options); let mut tracer = StripOuterSequenceSink::new(tracer); accept_events(&mut tracer, events.iter().cloned()).unwrap(); let root = tracer.into_inner().to_field("root").unwrap(); From 382c9babee9055028102f1d0044a6d3fab13fd7e Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 12:38:16 +0200 Subject: [PATCH 07/27] Add missing derive feature for serde --- serde_arrow/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index ce74e703..1045f80e 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -38,7 +38,7 @@ bytemuck = "1" # TODO: make optional, only required for str -> date conversions chrono = "0.4" half = { version = "2", features = ["bytemuck"] } -serde = "1.0" +serde = { version = "1.0", features = ["derive"] } arrow-array-46 = { package = "arrow-array", version = "46", optional = true } arrow-buffer-46 = { package = "arrow-buffer", version = 
"46", optional = true } From e1eeceb4eda0f6b8289d4369a7534e210fb0903d Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 12:38:57 +0200 Subject: [PATCH 08/27] Use TracedSchema internally throughout --- serde_arrow/src/arrow/mod.rs | 15 +- serde_arrow/src/arrow/schema.rs | 9 + serde_arrow/src/arrow2/mod.rs | 15 +- serde_arrow/src/arrow2/schema.rs | 9 + serde_arrow/src/internal/mod.rs | 42 +-- serde_arrow/src/internal/sink.rs | 4 - serde_arrow/src/internal/tracing/mod.rs | 54 +++- serde_arrow/src/internal/tracing/samples.rs | 23 +- serde_arrow/src/internal/tracing/tracer.rs | 98 +++--- serde_arrow/src/internal/tracing/types.rs | 337 ++++++++++---------- serde_arrow/src/lib.rs | 7 +- serde_arrow/src/test_impls/macros.rs | 8 +- 12 files changed, 324 insertions(+), 297 deletions(-) diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index 6a0e0196..1eecaa73 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -23,7 +23,7 @@ use crate::{ serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::TracingOptions, + tracing::{TracedSchema, TracingOptions}, }, }; @@ -71,10 +71,9 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let field = internal::serialize_into_field(items, name, options)?; - (&field).try_into() + let mut schema = TracedSchema::new(options); + schema.trace_samples(items)?; + let field = schema.to_field(name)?; + Field::try_from(&field) } /// Build arrays from the given items diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 931e256f..8234418a 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -4,6 +4,7 @@ use crate::{ internal::{ error::{error, fail, Error, Result}, schema::{GenericDataType, GenericField, GenericTimeUnit, 
Schema, Strategy, STRATEGY_KEY}, + tracing::TracedSchema, }, }; @@ -25,6 +26,14 @@ impl Schema { } } +/// Support for arrow types (requires one of the `arrow-*` features) +impl TracedSchema { + /// Build a vec of fields from a TracedSchema object + pub fn to_arrow_fields(&self) -> Result> { + self.to_fields()?.iter().map(Field::try_from).collect() + } +} + impl TryFrom<&DataType> for GenericDataType { type Error = Error; diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 391b9d7d..6591d3a3 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -22,7 +22,7 @@ use crate::{ serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::TracingOptions, + tracing::{TracedSchema, TracingOptions}, }, }; @@ -70,10 +70,9 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let field = internal::serialize_into_field(items, name, options)?; - (&field).try_into() + let mut schema = TracedSchema::new(options); + schema.trace_samples(items)?; + let field = schema.to_field(name)?; + Field::try_from(&field) } /// Serialize a sequence of objects representing a single array into an array diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 9a2ff0b4..10e36845 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -3,6 +3,7 @@ use crate::{ internal::{ error::{error, fail, Error, Result}, schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, + tracing::TracedSchema, }, }; @@ -24,6 +25,14 @@ impl Schema { } } +/// Support for arrow2 types (requires one of the `arrow2-*` features) +impl TracedSchema { + /// Build a vec of arrow2 fields from a TracedSchema object + pub fn to_arrow2_fields(&self) -> Result> { + 
self.to_fields()?.iter().map(Field::try_from).collect() + } +} + impl TryFrom<&Field> for GenericField { type Error = Error; diff --git a/serde_arrow/src/internal/mod.rs b/serde_arrow/src/internal/mod.rs index b758cb1b..c888d045 100644 --- a/serde_arrow/src/internal/mod.rs +++ b/serde_arrow/src/internal/mod.rs @@ -15,11 +15,10 @@ use serde::{Deserialize, Serialize}; use self::{ common::{BufferExtract, Buffers}, - error::{fail, Error, Result}, - schema::{GenericDataType, GenericField}, - sink::{serialize_into_sink, EventSerializer, EventSink, StripOuterSequenceSink}, + error::{Error, Result}, + schema::GenericField, + sink::{serialize_into_sink, EventSerializer, EventSink}, source::deserialize_from_source, - tracing::{trace_type, Tracer, TracingOptions}, }; pub static CONFIGURATION: RwLock = RwLock::new(Configuration { @@ -53,41 +52,6 @@ pub fn configure(f: F) { f(&mut guard) } -pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> -where - T: Serialize + ?Sized, -{ - let tracer = Tracer::new(String::from("$"), options); - let mut tracer = StripOuterSequenceSink::new(tracer); - serialize_into_sink(&mut tracer, items)?; - let root = tracer.into_inner().to_field("root")?; - - match root.data_type { - GenericDataType::Struct => {} - GenericDataType::Null => fail!("No records found to determine schema"), - dt => fail!("Unexpected root data type {dt:?}"), - }; - - Ok(root.children) -} - -pub fn serialize_into_field( - items: &T, - name: &str, - options: TracingOptions, -) -> Result -where - T: Serialize + ?Sized, -{ - let tracer = Tracer::new(String::from("$"), options); - let tracer = StripOuterSequenceSink::new(tracer); - let mut tracer = tracer; - serialize_into_sink(&mut tracer, items)?; - - let field = tracer.into_inner().to_field(name)?; - Ok(field) -} - pub struct GenericBuilder(pub serialization::Interpreter); impl GenericBuilder { diff --git a/serde_arrow/src/internal/sink.rs b/serde_arrow/src/internal/sink.rs index 6e647895..0c4c54c1 
100644 --- a/serde_arrow/src/internal/sink.rs +++ b/serde_arrow/src/internal/sink.rs @@ -125,10 +125,6 @@ impl StripOuterSequenceSink { state: StripOuterSequenceState::WaitForStart, } } - - pub fn into_inner(self) -> E { - self.wrapped - } } impl EventSink for StripOuterSequenceSink { diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index b30a7c1c..90418a96 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -4,10 +4,12 @@ pub mod types; use serde::{Deserialize, Serialize}; -use crate::internal::{schema::Schema, Result}; +use crate::internal::{ + error::{fail, Result}, + schema::{GenericDataType, GenericField, Schema}, +}; pub use tracer::Tracer; -pub use types::trace_type; /// Configure how the schema is traced /// @@ -43,7 +45,7 @@ pub struct TracingOptions { pub allow_null_fields: bool, /// If `true` serialize maps as structs (the default). See - /// [`Strategy::MapAsStruct`] for details. + /// [`Strategy::MapAsStruct`][crate::schema::Strategy] for details. pub map_as_struct: bool, /// If `true` serialize strings dictionary encoded. The default is `false`. @@ -73,8 +75,8 @@ pub struct TracingOptions { /// /// For string fields where all values are either missing or conform to one /// of the format the data type is set as `Date64` with strategy - /// [`NaiveStrAsDate64`][Strategy::NaiveStrAsDate64] or - /// [`UtcStrAsDate64`][Strategy::UtcStrAsDate64]. + /// [`NaiveStrAsDate64`][crate::schema::Strategy::NaiveStrAsDate64] or + /// [`UtcStrAsDate64`][crate::schema::Strategy::UtcStrAsDate64]. 
pub try_parse_dates: bool, } @@ -126,28 +128,52 @@ impl TracingOptions { } } -pub struct TracedSchema {} +/// Collect schema information from samples and types +pub struct TracedSchema { + tracer: Tracer, +} impl TracedSchema { - pub fn new() -> Self { - Self {} + /// Construct a new instance with the given options + pub fn new(options: TracingOptions) -> Self { + Self { + tracer: Tracer::new(String::from("$"), options), + } } - pub fn get_schema(&self) -> Result { - todo!() + pub(crate) fn to_field(&self, name: &str) -> Result { + self.tracer.to_field(name) } - // TODO: add get_arrow2_fields - // TODO: add get_arrow_fields + pub(crate) fn to_fields(&self) -> Result> { + let root = self.tracer.to_field("root")?; + + match root.data_type { + GenericDataType::Struct => Ok(root.children), + GenericDataType::Null => fail!("No records found to determine schema"), + dt => fail!("Unexpected root data type {dt:?}"), + } + } + + /// Convert the traced schema into a schema object + pub fn to_schema(&self) -> Result { + Ok(Schema { + fields: self.to_fields()?, + }) + } } impl TracedSchema { + /// Trace the given samples and collect schema information pub fn trace_samples(&mut self, samples: &T) -> Result<()> { - todo!() + self.tracer.reset()?; + self.tracer.trace_samples(samples) } + /// Trace the given type and collect schema information pub fn trace_type<'de, T: Deserialize<'de>>(&mut self) -> Result<()> { - todo!() + self.tracer.reset()?; + self.tracer.trace_type::() } } diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index 0b4881ce..0a83e20d 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -1,11 +1,13 @@ use std::collections::HashMap; +use serde::Serialize; + use crate::internal::{ error::{fail, Result}, event::Event, schema::{GenericDataType, Strategy}, sink::macros, - sink::EventSink, + sink::{serialize_into_sink, EventSink, StripOuterSequenceSink}, 
tracing::tracer::{ ListTracer, ListTracerState, MapTracer, MapTracerState, PrimitiveTracer, StructField, StructMode, StructTracer, StructTracerState, Tracer, TupleTracer, TupleTracerState, @@ -14,6 +16,25 @@ use crate::internal::{ tracing::TracingOptions, }; +impl Tracer { + pub fn trace_samples(&mut self, samples: &T) -> Result<()> { + let mut tracer = StripOuterSequenceSink::new(&mut *self); + serialize_into_sink(&mut tracer, samples) + } +} + +impl<'a> EventSink for &'a mut Tracer { + macros::forward_specialized_to_generic!(); + + fn accept(&mut self, event: Event<'_>) -> Result<()> { + (*self).accept(event) + } + + fn finish(&mut self) -> Result<()> { + (*self).finish() + } +} + impl EventSink for Tracer { macros::forward_specialized_to_generic!(); diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index df3e03bf..1c45e76b 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -64,10 +64,6 @@ impl Tracer { dispatch_tracer!(self, tracer => tracer.to_field(name)) } - pub fn get_depth(&self) -> usize { - self.get_path().chars().filter(|c| *c == '.').count() - } - pub fn get_options(&self) -> &TracingOptions { dispatch_tracer!(self, tracer => &tracer.options) } @@ -345,9 +341,6 @@ impl UnknownTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, UnknownTracerState::Finished) { - fail!("cannot reset an unfinished tracer"); - } self.state = UnknownTracerState::Unfinished; Ok(()) } @@ -433,13 +426,25 @@ impl MapTracer { } pub fn reset(&mut self) -> Result<()> { - self.key_tracer.reset()?; - self.value_tracer.reset()?; - self.state = MapTracerState::WaitForKey; - Ok(()) + match self.state { + MapTracerState::WaitForKey | MapTracerState::Finished => { + self.key_tracer.reset()?; + self.value_tracer.reset()?; + self.state = MapTracerState::WaitForKey; + Ok(()) + } + state => fail!("Cannot reset map tracer in state {state:?}"), + } } pub fn 
finish(&mut self) -> Result<()> { + if !matches!(self.state, MapTracerState::WaitForKey) { + fail!( + "Cannot finish map tracer in state {state:?}", + state = self.state + ); + } + self.key_tracer.finish()?; self.value_tracer.finish()?; self.state = MapTracerState::Finished; @@ -499,12 +504,14 @@ impl ListTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, ListTracerState::Finished) { - fail!("Cannot reset unfinished list tracer"); + match self.state { + ListTracerState::WaitForStart | ListTracerState::Finished => { + self.item_tracer.reset()?; + self.state = ListTracerState::Finished; + Ok(()) + } + state => fail!("cannot reset list tracer in {state:?}"), } - self.item_tracer.reset()?; - self.state = ListTracerState::Finished; - Ok(()) } pub fn finish(&mut self) -> Result<()> { @@ -574,14 +581,16 @@ impl TupleTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, TupleTracerState::Finished) { - fail!("Cannot reset unfinished tuple tracer"); - } - for tracer in &mut self.field_tracers { - tracer.reset()?; + match self.state { + TupleTracerState::WaitForStart | TupleTracerState::Finished => { + for tracer in &mut self.field_tracers { + tracer.reset()?; + } + self.state = TupleTracerState::WaitForStart; + Ok(()) + } + state => fail!("Cannot reset tuple tracer in state {state:?}"), } - self.state = TupleTracerState::WaitForStart; - Ok(()) } pub fn finish(&mut self) -> Result<()> { @@ -673,15 +682,17 @@ impl StructTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, StructTracerState::Finished) { - fail!("Cannot reset unfinished tracer"); - } - for field in &mut self.fields { - field.tracer.reset()?; - } + match self.state { + StructTracerState::WaitForKey | StructTracerState::Finished => { + for field in &mut self.fields { + field.tracer.reset()?; + } - self.state = StructTracerState::WaitForKey; - Ok(()) + self.state = StructTracerState::WaitForKey; + Ok(()) + } + state => fail!("Cannot 
unfinished tracer in state {state:?}"), + } } pub fn finish(&mut self) -> Result<()> { @@ -801,17 +812,19 @@ impl UnionTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, UnionTracerState::Finished) { - fail!("Cannot reset unfinished union tracer"); - } - for variant in &mut self.variants { - let Some(variant) = variant.as_mut() else { - continue; - }; - variant.tracer.reset()?; + match self.state { + UnionTracerState::WaitForVariant | UnionTracerState::Finished => { + for variant in &mut self.variants { + let Some(variant) = variant.as_mut() else { + continue; + }; + variant.tracer.reset()?; + } + self.state = UnionTracerState::WaitForVariant; + Ok(()) + } + state => fail!("Cannot reset union tracer in state {state:?}"), } - self.state = UnionTracerState::WaitForVariant; - Ok(()) } pub fn finish(&mut self) -> Result<()> { @@ -869,9 +882,6 @@ impl PrimitiveTracer { } pub fn reset(&mut self) -> Result<()> { - if !matches!(self.state, PrimitiveTracerState::Finished) { - fail!("Cannot reset an unfished tracer"); - } self.state = PrimitiveTracerState::Unfinished; Ok(()) } diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index 29f5a625..36fc7f0c 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -4,33 +4,25 @@ use serde::{ }; use crate::internal::{ - fail, - schema::GenericField, - tracing::{ - tracer::{StructField, Tracer}, - TracingOptions, - }, - Error, Result, + error::{fail, Error, Result}, + tracing::tracer::{StructField, Tracer}, }; -pub fn trace_type<'de, T: Deserialize<'de>>( - options: TracingOptions, - name: &str, -) -> Result { - let mut tracer = Tracer::new(String::from("$"), options); - - // TODO: make configurable - let mut attempts = 100; - while !tracer.is_complete() { - if attempts == 0 { - fail!("could not determine ...") +impl Tracer { + pub fn trace_type<'de, T: Deserialize<'de>>(&mut self) -> Result<()> { + // TODO: 
make configurable + let mut attempts = 100; + while !self.is_complete() { + if attempts == 0 { + fail!("could not determine ...") + } + T::deserialize(TraceAny(&mut *self))?; + attempts -= 1; } - T::deserialize(TraceAny(&mut tracer))?; - attempts -= 1; - } - tracer.finish()?; - tracer.to_field(name) + self.finish()?; + Ok(()) + } } struct TraceAny<'a>(&'a mut Tracer); @@ -455,175 +447,172 @@ impl<'de, 'a> serde::de::Deserializer<'de> for IdentifierDeserializer<'a> { unimplemented!('de, deserialize_ignored_any); } -#[test] -fn trace_primitives() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; - - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::I8, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::I16, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::I32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::I64, false) - ); - - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::U8, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::U16, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::U32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::U64, false) - ); - - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::F32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", T::F64, false) - ); -} +#[cfg(test)] +mod test { + use std::collections::HashMap; -#[test] -fn trace_option() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; - - assert_eq!( - trace_type::(TracingOptions::default(), "root").unwrap(), - F::new("root", 
T::I8, false) - ); - assert_eq!( - trace_type::>(TracingOptions::default(), "root").unwrap(), - F::new("root", T::I8, true) - ); -} + use serde::Deserialize; -#[test] -fn trace_struct() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; + use crate::internal::{ + schema::{GenericDataType as T, GenericField as F, Strategy}, + tracing::{TracedSchema, TracingOptions}, + }; - #[allow(dead_code)] - #[derive(Deserialize)] - struct Example { - a: bool, - b: Option, - } + fn trace_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> F { + let mut schema = TracedSchema::new(options); + schema.trace_type::().unwrap(); + schema.to_field("root").unwrap() + } + + #[test] + fn trace_primitives() { + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::F32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::F64, false) + ); + } + + #[test] + fn trace_option() { + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::>(TracingOptions::default()), + F::new("root", T::I8, true) + ); + } + + #[test] + fn trace_struct() { + #[allow(dead_code)] + #[derive(Deserialize)] + struct Example { + a: 
bool, + b: Option, + } - let actual = trace_type::(TracingOptions::default(), "root").unwrap(); - let expected = F::new("root", T::Struct, false) - .with_child(F::new("a", T::Bool, false)) - .with_child(F::new("b", T::I8, true)); + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("a", T::Bool, false)) + .with_child(F::new("b", T::I8, true)); - assert_eq!(actual, expected); -} + assert_eq!(actual, expected); + } -#[test] -fn trace_tuple_as_struct() { - use { - crate::internal::schema::GenericDataType as T, crate::internal::schema::Strategy, - GenericField as F, - }; + #[test] + fn trace_tuple_as_struct() { + let actual = trace_type::<(bool, Option)>(TracingOptions::default()); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("0", T::Bool, false)) + .with_child(F::new("1", T::I8, true)) + .with_strategy(Strategy::TupleAsStruct); - let actual = trace_type::<(bool, Option)>(TracingOptions::default(), "root").unwrap(); - let expected = F::new("root", T::Struct, false) - .with_child(F::new("0", T::Bool, false)) - .with_child(F::new("1", T::I8, true)) - .with_strategy(Strategy::TupleAsStruct); + assert_eq!(actual, expected); + } - assert_eq!(actual, expected); -} + #[test] + fn trace_union() { + #[allow(dead_code)] + #[derive(Deserialize)] + enum Example { + A(i8), + B(f32), + } -#[test] -fn trace_union() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Union, false) + .with_child(F::new("A", T::I8, false)) + .with_child(F::new("B", T::F32, false)); - #[allow(dead_code)] - #[derive(Deserialize)] - enum Example { - A(i8), - B(f32), + assert_eq!(actual, expected); } - let actual = trace_type::(TracingOptions::default(), "root").unwrap(); - let expected = F::new("root", T::Union, false) - .with_child(F::new("A", T::I8, false)) - .with_child(F::new("B", T::F32, 
false)); - - assert_eq!(actual, expected); -} + #[test] + fn trace_list() { + let actual = trace_type::>(TracingOptions::default()); + let expected = + F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); -#[test] -fn trace_list() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; + assert_eq!(actual, expected); + } - let actual = trace_type::>(TracingOptions::default(), "root").unwrap(); - let expected = - F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); + #[test] + fn trace_map() { + let actual = + trace_type::>(TracingOptions::default().map_as_struct(false)); + let expected = F::new("root", T::Map, false).with_child( + F::new("entries", T::Struct, false) + .with_child(F::new("key", T::I8, false)) + .with_child(F::new("value", T::LargeUtf8, false)), + ); - assert_eq!(actual, expected); -} + assert_eq!(actual, expected); + } -#[test] -fn trace_map() { - use std::collections::HashMap; - use {crate::internal::schema::GenericDataType as T, GenericField as F}; - - let actual = - trace_type::>(TracingOptions::default().map_as_struct(false), "root") - .unwrap(); - let expected = F::new("root", T::Map, false).with_child( - F::new("entries", T::Struct, false) - .with_child(F::new("key", T::I8, false)) - .with_child(F::new("value", T::LargeUtf8, false)), - ); - - assert_eq!(actual, expected); -} + #[test] + fn issue_90() { + #[derive(Deserialize)] + pub struct Distribution { + pub samples: Vec, + pub statistic: String, + } -#[test] -fn issue_90() { - use {crate::internal::schema::GenericDataType as T, GenericField as F}; + #[derive(Deserialize)] + pub struct VectorMetric { + pub distribution: Option, + } - #[derive(Deserialize)] - pub struct Distribution { - pub samples: Vec, - pub statistic: String, - } + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Struct, false).with_child( + F::new("distribution", T::Struct, true) + 
.with_child(F::new("samples", T::LargeList, false).with_child(F::new( + "element", + T::F64, + false, + ))) + .with_child(F::new("statistic", T::LargeUtf8, false)), + ); - #[derive(Deserialize)] - pub struct VectorMetric { - pub distribution: Option, + assert_eq!(actual, expected); } - - let actual = trace_type::(TracingOptions::default(), "root").unwrap(); - let expected = F::new("root", T::Struct, false).with_child( - F::new("distribution", T::Struct, true) - .with_child(F::new("samples", T::LargeList, false).with_child(F::new( - "element", - T::F64, - false, - ))) - .with_child(F::new("statistic", T::LargeUtf8, false)), - ); - - assert_eq!(actual, expected); } diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index e35ff534..fd32c145 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -9,8 +9,7 @@ //! as easy as using Serde's derive macros. //! //! In the Rust ecosystem there are two competing implementations of the arrow -//! in-memory format, [`arrow`][arrow] and [`arrow2`][arrow2]. `serde_arrow` -//! supports both. +//! in-memory format, [`arrow`][] and [`arrow2`][]. `serde_arrow` supports both. //! //! `serde_arrow` relies on a schema to translate between Rust and Arrow. The //! schema is expressed as Arrow fields and describes the schema of the arrays. 
@@ -250,15 +249,17 @@ pub use crate::internal::error::{Error, Result}; /// # #[cfg(not(feature="arrow2"))] /// # fn main() {} /// ``` +#[deny(missing_docs)] pub mod schema { pub use crate::internal::{ schema::{Schema, Strategy, STRATEGY_KEY}, - tracing::TracingOptions, + tracing::{TracedSchema, TracingOptions}, }; } /// Experimental functionality that is not bound by semver compatibility /// +#[deny(missing_docs)] pub mod experimental { pub use crate::internal::{configure, Configuration}; } diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index 302e04cb..949e947b 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -282,10 +282,10 @@ macro_rules! test_events { let options = TracingOptions::default(); $(let options = $tracing_options;)? - let tracer = Tracer::new(String::from("$"), options); - let mut tracer = StripOuterSequenceSink::new(tracer); - accept_events(&mut tracer, events.iter().cloned()).unwrap(); - let root = tracer.into_inner().to_field("root").unwrap(); + let mut tracer = Tracer::new(String::from("$"), options); + let mut sink = StripOuterSequenceSink::new(&mut tracer); + accept_events(&mut sink, events.iter().cloned()).unwrap(); + let root = tracer.to_field("root").unwrap(); assert_eq!(root.children, fields); } From b528a579cd7cb2d12806b257d766c9e3f1bff89d Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 12:44:19 +0200 Subject: [PATCH 09/27] Remove all top-level definitions from internal --- serde_arrow/src/arrow/mod.rs | 12 +- serde_arrow/src/arrow2/mod.rs | 12 +- serde_arrow/src/internal/config.rs | 32 ++++++ .../src/internal/deserialization/mod.rs | 2 +- serde_arrow/src/internal/generic.rs | 70 ++++++++++++ serde_arrow/src/internal/mod.rs | 103 +----------------- .../src/internal/serialization/compiler.rs | 2 +- serde_arrow/src/lib.rs | 2 +- 8 files changed, 119 insertions(+), 116 deletions(-) create mode 100644 
serde_arrow/src/internal/config.rs create mode 100644 serde_arrow/src/internal/generic.rs diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index 1eecaa73..55319600 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -17,7 +17,7 @@ use crate::{ datatypes::Field, }, internal::{ - self, + generic, error::Result, schema::GenericField, serialization::{compile_serialization, CompilationOptions, Interpreter}, @@ -270,7 +270,7 @@ where T: Deserialize<'de>, A: AsRef + 'de + ?Sized, { - internal::deserialize_from_array(field, array.as_ref()) + generic::deserialize_from_array(field, array.as_ref()) } /// Build a single array item by item @@ -294,7 +294,7 @@ where /// let array = builder.build_array().unwrap(); /// assert_eq!(array.len(), 6); /// ``` -pub struct ArrayBuilder(internal::GenericBuilder); +pub struct ArrayBuilder(generic::GenericBuilder); impl ArrayBuilder { /// Construct a new build for the given field @@ -302,7 +302,7 @@ impl ArrayBuilder { /// This method may fail for an unsupported data type of the given field. 
/// pub fn new(field: &Field) -> Result { - Ok(Self(internal::GenericBuilder::new_for_array( + Ok(Self(generic::GenericBuilder::new_for_array( GenericField::try_from(field)?, )?)) } @@ -365,7 +365,7 @@ impl ArrayBuilder { /// assert_eq!(arrays.len(), 2); /// assert_eq!(arrays[0].len(), 6); /// ``` -pub struct ArraysBuilder(internal::GenericBuilder); +pub struct ArraysBuilder(generic::GenericBuilder); impl std::fmt::Debug for ArraysBuilder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -384,7 +384,7 @@ impl ArraysBuilder { .iter() .map(GenericField::try_from) .collect::>>()?; - Ok(Self(internal::GenericBuilder::new_for_arrays(&fields)?)) + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) } /// Add a single record to the arrays diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 6591d3a3..96e5c7c4 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use crate::{ _impl::arrow2::{array::Array, datatypes::Field}, internal::{ - self, + generic, error::Result, schema::GenericField, serialization::{compile_serialization, CompilationOptions, Interpreter}, @@ -271,7 +271,7 @@ where T: Deserialize<'de>, A: AsRef + 'de + ?Sized, { - internal::deserialize_from_array(field, array.as_ref()) + generic::deserialize_from_array(field, array.as_ref()) } /// Build a single array item by item @@ -295,7 +295,7 @@ where /// let array = builder.build_array().unwrap(); /// assert_eq!(array.len(), 6); /// ``` -pub struct ArrayBuilder(internal::GenericBuilder); +pub struct ArrayBuilder(generic::GenericBuilder); impl ArrayBuilder { /// Construct a new build for the given field @@ -303,7 +303,7 @@ impl ArrayBuilder { /// This method may fail for an unsupported data type of the given field. 
/// pub fn new(field: &Field) -> Result { - Ok(Self(internal::GenericBuilder::new_for_array( + Ok(Self(generic::GenericBuilder::new_for_array( GenericField::try_from(field)?, )?)) } @@ -366,7 +366,7 @@ impl ArrayBuilder { /// assert_eq!(arrays.len(), 2); /// assert_eq!(arrays[0].len(), 6); /// ``` -pub struct ArraysBuilder(internal::GenericBuilder); +pub struct ArraysBuilder(generic::GenericBuilder); impl std::fmt::Debug for ArraysBuilder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -385,7 +385,7 @@ impl ArraysBuilder { .iter() .map(GenericField::try_from) .collect::>>()?; - Ok(Self(internal::GenericBuilder::new_for_arrays(&fields)?)) + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) } /// Add a single record to the arrays diff --git a/serde_arrow/src/internal/config.rs b/serde_arrow/src/internal/config.rs new file mode 100644 index 00000000..33b9f304 --- /dev/null +++ b/serde_arrow/src/internal/config.rs @@ -0,0 +1,32 @@ +use std::sync::RwLock; + +pub static CONFIGURATION: RwLock = RwLock::new(Configuration { + debug_print_program: false, + _prevent_construction: (), +}); + +/// The crate settings can be configured by calling [configure] +#[derive(Default, Clone)] +pub struct Configuration { + pub(crate) debug_print_program: bool, + /// A non public member to allow extending the member list as non-breaking + /// changes + _prevent_construction: (), +} + +/// Change global configuration options +/// +/// Note the configuration will be shared by all threads in the current program. +/// Thread-local configurations are not supported at the moment. 
+/// +/// Usage: +/// +/// ``` +/// serde_arrow::experimental::configure(|c| { +/// // set attributes on c +/// }); +/// ``` +pub fn configure(f: F) { + let mut guard = CONFIGURATION.write().unwrap(); + f(&mut guard) +} diff --git a/serde_arrow/src/internal/deserialization/mod.rs b/serde_arrow/src/internal/deserialization/mod.rs index 8989f95a..2f9e6762 100644 --- a/serde_arrow/src/internal/deserialization/mod.rs +++ b/serde_arrow/src/internal/deserialization/mod.rs @@ -11,7 +11,7 @@ use crate::{ use super::{ common::{define_bytecode, ArrayMapping, Buffers, DictionaryIndex, DictionaryValue}, - CONFIGURATION, + config::CONFIGURATION, }; use half::f16; diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs new file mode 100644 index 00000000..d48a2d66 --- /dev/null +++ b/serde_arrow/src/internal/generic.rs @@ -0,0 +1,70 @@ +use serde::{Deserialize, Serialize}; + +use crate::internal::{ + common::{BufferExtract, Buffers}, + error::{Error, Result}, + schema::GenericField, + sink::{serialize_into_sink, EventSerializer, EventSink}, + source::deserialize_from_source, + serialization, + deserialization, +}; + + +pub struct GenericBuilder(pub serialization::Interpreter); + +impl GenericBuilder { + pub fn new_for_array(field: GenericField) -> Result { + let program = serialization::compile_serialization( + std::slice::from_ref(&field), + serialization::CompilationOptions::default().wrap_with_struct(false), + )?; + let interpreter = serialization::Interpreter::new(program); + + Ok(Self(interpreter)) + } + + pub fn new_for_arrays(fields: &[GenericField]) -> Result { + let program = serialization::compile_serialization( + fields, + serialization::CompilationOptions::default(), + )?; + let interpreter = serialization::Interpreter::new(program); + + Ok(Self(interpreter)) + } + + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.accept_start_sequence()?; + self.0.accept_item()?; + item.serialize(EventSerializer(&mut self.0))?; + 
self.0.accept_end_sequence()?; + self.0.finish() + } + + pub fn extend(&mut self, items: &T) -> Result<()> { + serialize_into_sink(&mut self.0, items) + } +} + +pub fn deserialize_from_array<'de, T, F, A>(field: &'de F, array: &'de A) -> Result +where + T: Deserialize<'de>, + F: 'static, + GenericField: TryFrom<&'de F, Error = Error>, + A: BufferExtract + ?Sized, +{ + let field = GenericField::try_from(field)?; + let num_items = array.len(); + + let mut buffers = Buffers::new(); + let mapping = array.extract_buffers(&field, &mut buffers)?; + + let interpreter = deserialization::compile_deserialization( + num_items, + std::slice::from_ref(&mapping), + buffers, + deserialization::CompilationOptions::default().wrap_with_struct(false), + )?; + deserialize_from_source(interpreter) +} diff --git a/serde_arrow/src/internal/mod.rs b/serde_arrow/src/internal/mod.rs index c888d045..66aca0e8 100644 --- a/serde_arrow/src/internal/mod.rs +++ b/serde_arrow/src/internal/mod.rs @@ -1,111 +1,12 @@ pub mod common; +pub mod config; pub mod conversions; pub mod deserialization; pub mod error; pub mod event; +pub mod generic; pub mod schema; pub mod serialization; pub mod sink; pub mod source; pub mod tracing; - -use std::sync::RwLock; - -use serde::{Deserialize, Serialize}; - -use self::{ - common::{BufferExtract, Buffers}, - error::{Error, Result}, - schema::GenericField, - sink::{serialize_into_sink, EventSerializer, EventSink}, - source::deserialize_from_source, -}; - -pub static CONFIGURATION: RwLock = RwLock::new(Configuration { - debug_print_program: false, - _prevent_construction: (), -}); - -/// The crate settings can be configured by calling [configure] -#[derive(Default, Clone)] -pub struct Configuration { - pub(crate) debug_print_program: bool, - /// A non public member to allow extending the member list as non-breaking - /// changes - _prevent_construction: (), -} - -/// Change global configuration options -/// -/// Note the configuration will be shared by all threads in 
the current program. -/// Thread-local configurations are not supported at the moment. -/// -/// Usage: -/// -/// ``` -/// serde_arrow::experimental::configure(|c| { -/// // set attributes on c -/// }); -/// ``` -pub fn configure(f: F) { - let mut guard = CONFIGURATION.write().unwrap(); - f(&mut guard) -} - -pub struct GenericBuilder(pub serialization::Interpreter); - -impl GenericBuilder { - pub fn new_for_array(field: GenericField) -> Result { - let program = serialization::compile_serialization( - std::slice::from_ref(&field), - serialization::CompilationOptions::default().wrap_with_struct(false), - )?; - let interpreter = serialization::Interpreter::new(program); - - Ok(Self(interpreter)) - } - - pub fn new_for_arrays(fields: &[GenericField]) -> Result { - let program = serialization::compile_serialization( - fields, - serialization::CompilationOptions::default(), - )?; - let interpreter = serialization::Interpreter::new(program); - - Ok(Self(interpreter)) - } - - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.accept_start_sequence()?; - self.0.accept_item()?; - item.serialize(EventSerializer(&mut self.0))?; - self.0.accept_end_sequence()?; - self.0.finish() - } - - pub fn extend(&mut self, items: &T) -> Result<()> { - serialize_into_sink(&mut self.0, items) - } -} - -pub fn deserialize_from_array<'de, T, F, A>(field: &'de F, array: &'de A) -> Result -where - T: Deserialize<'de>, - F: 'static, - GenericField: TryFrom<&'de F, Error = Error>, - A: BufferExtract + ?Sized, -{ - let field = GenericField::try_from(field)?; - let num_items = array.len(); - - let mut buffers = Buffers::new(); - let mapping = array.extract_buffers(&field, &mut buffers)?; - - let interpreter = deserialization::compile_deserialization( - num_items, - std::slice::from_ref(&mapping), - buffers, - deserialization::CompilationOptions::default().wrap_with_struct(false), - )?; - deserialize_from_source(interpreter) -} diff --git a/serde_arrow/src/internal/serialization/compiler.rs 
b/serde_arrow/src/internal/serialization/compiler.rs index 5d4a0430..47022bf4 100644 --- a/serde_arrow/src/internal/serialization/compiler.rs +++ b/serde_arrow/src/internal/serialization/compiler.rs @@ -3,7 +3,7 @@ use crate::internal::{ error::Result, error::{error, fail}, schema::{GenericDataType, GenericField, GenericTimeUnit, Strategy}, - CONFIGURATION, + config::CONFIGURATION, }; use super::{ diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index fd32c145..8944967b 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -261,5 +261,5 @@ pub mod schema { /// #[deny(missing_docs)] pub mod experimental { - pub use crate::internal::{configure, Configuration}; + pub use crate::internal::config::{configure, Configuration}; } From 1841cab8d0ae9ba22895e5000eb82b655c7997b5 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 14:26:52 +0200 Subject: [PATCH 10/27] Rename TracedSchema to SchemaTracer --- serde_arrow/src/arrow/mod.rs | 8 ++++---- serde_arrow/src/arrow/schema.rs | 4 ++-- serde_arrow/src/arrow2/mod.rs | 8 ++++---- serde_arrow/src/arrow2/schema.rs | 4 ++-- serde_arrow/src/internal/generic.rs | 5 ++--- serde_arrow/src/internal/schema.rs | 17 +++++++++++++++++ .../src/internal/serialization/compiler.rs | 2 +- serde_arrow/src/internal/tracing/mod.rs | 6 +++--- serde_arrow/src/internal/tracing/types.rs | 4 ++-- serde_arrow/src/lib.rs | 2 +- 10 files changed, 38 insertions(+), 22 deletions(-) diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index 55319600..a732700b 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -17,13 +17,13 @@ use crate::{ datatypes::Field, }, internal::{ - generic, error::Result, + generic, schema::GenericField, serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::{TracedSchema, TracingOptions}, + tracing::{SchemaTracer, TracingOptions}, }, }; @@ -71,7 +71,7 
@@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let mut schema = TracedSchema::new(options); + let mut schema = SchemaTracer::new(options); schema.trace_samples(items)?; let field = schema.to_field(name)?; Field::try_from(&field) diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 8234418a..65e46ae7 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -4,7 +4,7 @@ use crate::{ internal::{ error::{error, fail, Error, Result}, schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, - tracing::TracedSchema, + tracing::SchemaTracer, }, }; @@ -27,7 +27,7 @@ impl Schema { } /// Support for arrow types (requires one of the `arrow-*` features) -impl TracedSchema { +impl SchemaTracer { /// Build a vec of fields from a TracedSchema object pub fn to_arrow_fields(&self) -> Result> { self.to_fields()?.iter().map(Field::try_from).collect() diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 96e5c7c4..9f76517c 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -16,13 +16,13 @@ use serde::{Deserialize, Serialize}; use crate::{ _impl::arrow2::{array::Array, datatypes::Field}, internal::{ - generic, error::Result, + generic, schema::GenericField, serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::{TracedSchema, TracingOptions}, + tracing::{SchemaTracer, TracingOptions}, }, }; @@ -70,7 +70,7 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let mut schema = TracedSchema::new(options); + let mut schema = SchemaTracer::new(options); schema.trace_samples(items)?; let field = schema.to_field(name)?; 
Field::try_from(&field) diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 10e36845..68cd61fb 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -3,7 +3,7 @@ use crate::{ internal::{ error::{error, fail, Error, Result}, schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, - tracing::TracedSchema, + tracing::SchemaTracer, }, }; @@ -26,7 +26,7 @@ impl Schema { } /// Support for arrow2 types (requires one of the `arrow2-*` features) -impl TracedSchema { +impl SchemaTracer { /// Build a vec of arrow2 fields from a TracedSchema object pub fn to_arrow2_fields(&self) -> Result> { self.to_fields()?.iter().map(Field::try_from).collect() diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index d48a2d66..1295b1c4 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -2,15 +2,14 @@ use serde::{Deserialize, Serialize}; use crate::internal::{ common::{BufferExtract, Buffers}, + deserialization, error::{Error, Result}, schema::GenericField, + serialization, sink::{serialize_into_sink, EventSerializer, EventSink}, source::deserialize_from_source, - serialization, - deserialization, }; - pub struct GenericBuilder(pub serialization::Interpreter); impl GenericBuilder { diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index 7d92d475..4f9644b1 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -15,6 +15,23 @@ pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; /// A collection of fields that can be easily serialized and deserialized /// +/// ```rust +/// # use serde_arrow::schema::Schema; +/// let schema_json = r#" +/// [ +/// { +/// "name": "date", +/// "data_type": "Date64", +/// "strategy": "NaiveStrAsDate64" +/// }, +/// {"name":"foo","data_type":"U8"}, +/// {"name":"bar","data_type":"Utf8"} +/// ] +/// "#; +/// +/// let 
schema: Schema = serde_json::from_str(&schema_json).unwrap(); +/// ``` +/// /// The serialization format is designed to be as easy as possible to to write /// by hand. The schema can be given in two ways: /// diff --git a/serde_arrow/src/internal/serialization/compiler.rs b/serde_arrow/src/internal/serialization/compiler.rs index 47022bf4..6ad85644 100644 --- a/serde_arrow/src/internal/serialization/compiler.rs +++ b/serde_arrow/src/internal/serialization/compiler.rs @@ -1,9 +1,9 @@ use crate::internal::{ common::{ArrayMapping, DictionaryIndex, DictionaryValue}, + config::CONFIGURATION, error::Result, error::{error, fail}, schema::{GenericDataType, GenericField, GenericTimeUnit, Strategy}, - config::CONFIGURATION, }; use super::{ diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index 90418a96..14054442 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -129,11 +129,11 @@ impl TracingOptions { } /// Collect schema information from samples and types -pub struct TracedSchema { +pub struct SchemaTracer { tracer: Tracer, } -impl TracedSchema { +impl SchemaTracer { /// Construct a new instance with the given options pub fn new(options: TracingOptions) -> Self { Self { @@ -163,7 +163,7 @@ impl TracedSchema { } } -impl TracedSchema { +impl SchemaTracer { /// Trace the given samples and collect schema information pub fn trace_samples(&mut self, samples: &T) -> Result<()> { self.tracer.reset()?; diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index 36fc7f0c..4f2388b0 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -455,11 +455,11 @@ mod test { use crate::internal::{ schema::{GenericDataType as T, GenericField as F, Strategy}, - tracing::{TracedSchema, TracingOptions}, + tracing::{SchemaTracer, TracingOptions}, }; fn trace_type<'de, T: Deserialize<'de>>(options: 
TracingOptions) -> F { - let mut schema = TracedSchema::new(options); + let mut schema = SchemaTracer::new(options); schema.trace_type::().unwrap(); schema.to_field("root").unwrap() } diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 8944967b..141e7405 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -253,7 +253,7 @@ pub use crate::internal::error::{Error, Result}; pub mod schema { pub use crate::internal::{ schema::{Schema, Strategy, STRATEGY_KEY}, - tracing::{TracedSchema, TracingOptions}, + tracing::{SchemaTracer, TracingOptions}, }; } From 9deeda109201d94dbc858df111302825c2a7f6a1 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 28 Oct 2023 16:01:28 +0200 Subject: [PATCH 11/27] Tune public interface, test mixed type / sample tracing --- Changes.md | 5 + serde_arrow/src/arrow/schema.rs | 8 +- serde_arrow/src/arrow2/schema.rs | 6 + serde_arrow/src/internal/tracing/mod.rs | 34 ++- serde_arrow/src/internal/tracing/samples.rs | 95 ++++--- serde_arrow/src/internal/tracing/tracer.rs | 86 ++++++- serde_arrow/src/internal/tracing/types.rs | 178 ------------- serde_arrow/src/test_impls/chrono.rs | 13 + ...issue_74.rs => issue_74_unknown_fields.rs} | 0 ...> issue_79_declared_but_missing_fields.rs} | 0 .../src/test_impls/issue_90_type_tracing.rs | 239 ++++++++++++++++++ serde_arrow/src/test_impls/mod.rs | 5 +- 12 files changed, 430 insertions(+), 239 deletions(-) rename serde_arrow/src/test_impls/{issue_74.rs => issue_74_unknown_fields.rs} (100%) rename serde_arrow/src/test_impls/{issue_79.rs => issue_79_declared_but_missing_fields.rs} (100%) create mode 100644 serde_arrow/src/test_impls/issue_90_type_tracing.rs diff --git a/Changes.md b/Changes.md index 3ef998cf..51d6d2cb 100644 --- a/Changes.md +++ b/Changes.md @@ -1,5 +1,10 @@ # Change log +## 0.9.0 + +- Remove `try_parse_dates` in favor of `guess_dates` field in `TracingOptions` +- Add type based tracing to allow schema tracing without samples + ## 0.8.0 Make bytecode based 
serialization and deserialization the default diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 65e46ae7..5dd105ef 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -20,8 +20,14 @@ impl Schema { }) } - /// Build a vec of fields from a Schema object + /// Build a vec of fields from a Schema object + #[deprecated = "The method `get_arrow_fields` is deprecated. Use `to_arrow_fields` instead"] pub fn get_arrow_fields(&self) -> Result> { + self.to_arrow_fields() + } + + /// Build a vec of fields from a Schema object + pub fn to_arrow_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } } diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 68cd61fb..9faa513b 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -20,7 +20,13 @@ impl Schema { } /// Build a vec of fields from a Schema object + #[deprecated = "The method `get_arrow2_fields` is deprecated. 
Use `to_arrow2_fields` instead"] pub fn get_arrow2_fields(&self) -> Result> { + self.to_arrow2_fields() + } + + /// Build a vec of fields from a Schema object + pub fn to_arrow2_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } } diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index 14054442..5a4ad87c 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -26,18 +26,16 @@ pub use tracer::Tracer; /// /// ```rust /// # use serde_arrow::schema::TracingOptions; -/// # let defaults = -/// TracingOptions { -/// allow_null_fields: false, -/// map_as_struct: true, -/// string_dictionary_encoding: false, -/// coerce_numbers: false, -/// try_parse_dates: false, -/// } -/// # ; -/// # assert_eq!(defaults, TracingOptions::default()); +/// let default = TracingOptions::default(); +/// +/// assert_eq!(default.allow_null_fields, false); +/// assert_eq!(default.map_as_struct, true); +/// assert_eq!(default.string_dictionary_encoding, false); +/// assert_eq!(default.coerce_numbers, false); +/// assert_eq!(default.guess_dates, false); /// ``` #[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] pub struct TracingOptions { /// If `true`, accept null-only fields (e.g., fields with type `()` or fields /// with only `None` entries). If `false`, schema tracing will fail in this @@ -77,7 +75,7 @@ pub struct TracingOptions { /// of the format the data type is set as `Date64` with strategy /// [`NaiveStrAsDate64`][crate::schema::Strategy::NaiveStrAsDate64] or /// [`UtcStrAsDate64`][crate::schema::Strategy::UtcStrAsDate64]. 
- pub try_parse_dates: bool, + pub guess_dates: bool, } impl Default for TracingOptions { @@ -87,7 +85,7 @@ impl Default for TracingOptions { map_as_struct: true, string_dictionary_encoding: false, coerce_numbers: false, - try_parse_dates: false, + guess_dates: false, } } } @@ -97,33 +95,33 @@ impl TracingOptions { Default::default() } - /// Configure `allow_null_fields` + /// Set [`allow_null_fields`](#structfield.allow_null_fields) pub fn allow_null_fields(mut self, value: bool) -> Self { self.allow_null_fields = value; self } - /// Configure `map_as_struct` + /// Set [`map_as_struct`](#structfield.map_as_struct) pub fn map_as_struct(mut self, value: bool) -> Self { self.map_as_struct = value; self } - /// Configure `string_dictionary_encoding` + /// Set [`string_dictionary_encoding`](#structfield.string_dictionary_encoding) pub fn string_dictionary_encoding(mut self, value: bool) -> Self { self.string_dictionary_encoding = value; self } - /// Configure `coerce_numbers` + /// Set [`coerce_numbers`](#structfield.coerce_numbers) pub fn coerce_numbers(mut self, value: bool) -> Self { self.coerce_numbers = value; self } - /// Configure `coerce_numbers` + /// Set [`guess_dates`](#structfield.guess_dates) pub fn guess_dates(mut self, value: bool) -> Self { - self.try_parse_dates = value; + self.guess_dates = value; self } } diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index 0a83e20d..8c1ceae6 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -156,7 +156,7 @@ impl StructTracer { index: HashMap::new(), nullable, state: StructTracerState::WaitForKey, - current_sample: 0, + seen_samples: 0, } } } @@ -181,7 +181,7 @@ impl EventSink for StructTracer { let Some(field) = self.fields.get_mut(field_idx) else { fail!("invalid state"); }; - field.last_seen_in_sample = self.current_sample; + field.last_seen_in_sample = self.seen_samples; InValue(field_idx,
0) } else { @@ -191,11 +191,11 @@ impl EventSink for StructTracer { self.options.clone(), ), name: key.to_owned(), - last_seen_in_sample: self.current_sample, + last_seen_in_sample: self.seen_samples, }; // field was missing in previous samples - if self.current_sample != 0 { + if self.seen_samples != 0 { println!("{key}"); field.tracer.mark_nullable(); } @@ -209,11 +209,11 @@ impl EventSink for StructTracer { (InKey, E::EndStruct | E::EndMap) => { for field in &mut self.fields { // field. was not seen in this sample - if field.last_seen_in_sample != self.current_sample { + if field.last_seen_in_sample != self.seen_samples { field.tracer.mark_nullable(); } } - self.current_sample += 1; + self.seen_samples += 1; WaitForKey } @@ -543,36 +543,55 @@ impl EventSink for PrimitiveTracer { ev => fail!("Cannot handle event {ev} in primitive tracer"), }; - (self.item_type, self.strategy) = match (&self.item_type, ev_type) { - (ty, Null) => { + // coercion rules as a table of (this_ty, this_strategy), (ev_ty, ev_strategy) + (self.item_type, self.strategy) = match ( + (&self.item_type, self.strategy.as_ref()), + (ev_type, ev_strategy), + ) { + ((ty, strategy), (Null, None)) => { self.nullable = true; - (ty.clone(), self.strategy.clone()) + (ty.clone(), strategy.cloned()) } - (Bool | Null, Bool) => (Bool, None), - (I8 | Null, I8) => (I8, None), - (I16 | Null, I16) => (I16, None), - (I32 | Null, I32) => (I32, None), - (I64 | Null, I64) => (I64, None), - (U8 | Null, U8) => (U8, None), - (U16 | Null, U16) => (U16, None), - (U32 | Null, U32) => (U32, None), - (U64 | Null, U64) => (U64, None), - (F32 | Null, F32) => (F32, None), - (F64 | Null, F64) => (F64, None), - (Null, Date64) => (Date64, ev_strategy), - (Date64, Date64) => match (&self.strategy, ev_strategy) { - (Some(S::NaiveStrAsDate64), Some(S::NaiveStrAsDate64)) => { - (Date64, Some(S::NaiveStrAsDate64)) - } - (Some(S::UtcStrAsDate64), Some(S::UtcStrAsDate64)) => { - (Date64, Some(S::UtcStrAsDate64)) - } - _ => (LargeUtf8, 
None), - }, - (LargeUtf8 | Null, LargeUtf8) | (Date64, LargeUtf8) | (LargeUtf8, Date64) => { + ((Null, None), (ev_type, ev_strategy)) => (ev_type, ev_strategy), + ((Bool, None), (Bool, None)) => (Bool, None), + ((I8, None), (I8, None)) => (I8, None), + ((I16, None), (I16, None)) => (I16, None), + ((I32, None), (I32, None)) => (I32, None), + ((I64, None), (I64, None)) => (I64, None), + ((U8, None), (U8, None)) => (U8, None), + ((U16, None), (U16, None)) => (U16, None), + ((U32, None), (U32, None)) => (U32, None), + ((U64, None), (U64, None)) => (U64, None), + ((F32, None), (F32, None)) => (F32, None), + ((F64, None), (F64, None)) => (F64, None), + ((Date64, Some(S::NaiveStrAsDate64)), (Date64, Some(S::NaiveStrAsDate64))) => { + (Date64, Some(S::NaiveStrAsDate64)) + } + ((Date64, Some(S::UtcStrAsDate64)), (Date64, Some(S::UtcStrAsDate64))) => { + (Date64, Some(S::UtcStrAsDate64)) + } + ((Date64, Some(S::NaiveStrAsDate64)), (Date64, Some(S::UtcStrAsDate64))) => { + (LargeUtf8, None) + } + // incompatible strategies, coerce to string + ((Date64, Some(S::UtcStrAsDate64)), (Date64, Some(S::NaiveStrAsDate64))) => { (LargeUtf8, None) } - (ty, ev) if self.options.coerce_numbers => match (ty, ev) { + ( + (LargeUtf8, None) | (Date64, Some(S::NaiveStrAsDate64) | Some(S::UtcStrAsDate64)), + (LargeUtf8, None), + ) => (LargeUtf8, None), + ( + (LargeUtf8, None), + (Date64, strategy @ (Some(S::NaiveStrAsDate64) | Some(S::UtcStrAsDate64))), + ) => { + if self.seen_samples == 0 { + (Date64, strategy) + } else { + (LargeUtf8, None) + } + } + ((ty, None), (ev, None)) if self.options.coerce_numbers => match (ty, ev) { // unsigned x unsigned -> u64 (U8 | U16 | U32 | U64, U8 | U16 | U32 | U64) => (U64, None), // signed x signed -> i64 @@ -589,8 +608,12 @@ impl EventSink for PrimitiveTracer { (F32 | F64, I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64) => (F64, None), (ty, ev) => fail!("Cannot accept event {ev} for tracer of primitive type {ty}"), }, - (ty, ev) => fail!("Cannot accept event 
{ev} for tracer of primitive type {ty}"), + ((this_ty, this_strategy), (ev_ty, ev_strategy)) => { + fail!("Cannot accept event {ev_ty} with strategy {ev_strategy:?} for tracer of primitive type {this_ty} with strategy {this_strategy:?}") + } }; + + self.seen_samples += 1; Ok(()) } @@ -601,11 +624,9 @@ impl EventSink for PrimitiveTracer { impl PrimitiveTracer { fn get_string_type_and_strategy(&self, s: &str) -> (GenericDataType, Option) { - if !self.options.try_parse_dates { - (GenericDataType::LargeUtf8, None) - } else if matches_naive_datetime(s) { + if self.options.guess_dates && matches_naive_datetime(s) { (GenericDataType::Date64, Some(Strategy::NaiveStrAsDate64)) - } else if matches_utc_datetime(s) { + } else if self.options.guess_dates && matches_utc_datetime(s) { (GenericDataType::Date64, Some(Strategy::UtcStrAsDate64)) } else { (GenericDataType::LargeUtf8, None) diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index 1c45e76b..5ce98cbf 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -56,6 +56,10 @@ impl Tracer { dispatch_tracer!(self, tracer => tracer.get_type()) } + pub fn get_strategy(&self) -> Option<&Strategy> { + dispatch_tracer!(self, tracer => tracer.get_strategy()) + } + pub fn get_nullable(&self) -> bool { dispatch_tracer!(self, tracer => tracer.nullable) } @@ -114,7 +118,7 @@ impl Tracer { nullable: this.get_nullable(), mode: StructMode::Struct, state: StructTracerState::WaitForKey, - current_sample: 0, + seen_samples: 0, }; *this = Self::Struct(tracer); Ok(()) @@ -249,6 +253,48 @@ impl Tracer { } } +impl Tracer { + pub fn ensure_utf8(&mut self) -> Result<()> { + if self.is_unknown() { + let tracer = PrimitiveTracer::new( + self.get_path().to_owned(), + self.get_options().clone(), + GenericDataType::LargeUtf8, + self.get_nullable(), + ); + *self = Self::Primitive(tracer); + } + self.ensure_utf8_type_compatible() + } + + pub fn 
ensure_utf8_type_compatible(&self) -> Result<()> { + let Some(item_type) = self.get_type() else { + fail!("unknown tracer is not compatible with LargeUtf8"); + }; + + let strategy = self.get_strategy(); + + let compatible = matches!( + (item_type, strategy), + (GenericDataType::LargeUtf8, None) + | (GenericDataType::Utf8, None) + | (GenericDataType::Date64, Some(Strategy::UtcStrAsDate64)) + | (GenericDataType::Date64, Some(Strategy::NaiveStrAsDate64)) + ); + + if !compatible { + fail!( + "mismatched types, previous {:?} with strategy {:?}, current {:?}", + item_type, + strategy, + GenericDataType::LargeUtf8 + ); + } + + Ok(()) + } +} + macro_rules! impl_primitive_ensures { ( $( @@ -294,7 +340,6 @@ impl_primitive_ensures!( (ensure_u64, U64), (ensure_f32, F32), (ensure_f64, F64), - (ensure_utf8, LargeUtf8), ); #[derive(Debug, PartialEq, Clone)] @@ -353,6 +398,10 @@ impl UnknownTracer { Ok(()) } + pub fn get_strategy(&self) -> Option<&Strategy> { + None + } + pub fn get_path(&self) -> &str { &self.path } @@ -398,6 +447,10 @@ impl MapTracer { } } + pub fn get_strategy(&self) -> Option<&Strategy> { + None + } + pub fn get_path(&self) -> &str { &self.path } @@ -480,6 +533,10 @@ impl ListTracer { } } + pub fn get_strategy(&self) -> Option<&Strategy> { + None + } + pub fn get_path(&self) -> &str { &self.path } @@ -580,6 +637,10 @@ impl TupleTracer { Some(&GenericDataType::Struct) } + pub fn get_strategy(&self) -> Option<&Strategy> { + Some(&Strategy::TupleAsStruct) + } + pub fn reset(&mut self) -> Result<()> { match self.state { TupleTracerState::WaitForStart | TupleTracerState::Finished => { @@ -624,7 +685,8 @@ pub struct StructTracer { pub index: HashMap, pub mode: StructMode, pub state: StructTracerState, - pub current_sample: usize, + /// Count how many samples were seen by this tracer + pub seen_samples: usize, } #[derive(Debug, PartialEq, Clone)] @@ -657,6 +719,13 @@ impl StructTracer { &self.path } + pub fn get_strategy(&self) -> Option<&Strategy> { + match 
self.mode { + StructMode::Struct => None, + StructMode::Map => Some(&Strategy::MapAsStruct), + } + } + pub fn is_complete(&self) -> bool { self.fields.iter().all(|field| field.tracer.is_complete()) } @@ -791,6 +860,10 @@ impl UnionTracer { Some(&GenericDataType::Union) } + pub fn get_strategy(&self) -> Option<&Strategy> { + None + } + pub fn to_field(&self, name: &str) -> Result { if !matches!(self.state, UnionTracerState::Finished) { fail!("Cannot build field {name} from unfinished tracer"); @@ -848,6 +921,8 @@ pub struct PrimitiveTracer { pub strategy: Option, pub item_type: GenericDataType, pub state: PrimitiveTracerState, + /// Count how many samples were seen by this tracer + pub seen_samples: usize, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -870,6 +945,7 @@ impl PrimitiveTracer { nullable, strategy: None, state: PrimitiveTracerState::Unfinished, + seen_samples: 0, } } @@ -929,4 +1005,8 @@ impl PrimitiveTracer { pub fn get_type(&self) -> Option<&GenericDataType> { Some(&self.item_type) } + + pub fn get_strategy(&self) -> Option<&Strategy> { + self.strategy.as_ref() + } } diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index 4f2388b0..c972db5a 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -95,19 +95,11 @@ impl<'de, 'a> serde::de::Deserializer<'de> for TraceAny<'a> { } fn deserialize_str>(self, visitor: V) -> Result { - if self.0.get_options().try_parse_dates { - fail!("Cannot try to parse dates without examples, prefer serialize_into_field(s)"); - } - self.0.ensure_utf8()?; visitor.visit_str("") } fn deserialize_string>(self, visitor: V) -> Result { - if self.0.get_options().try_parse_dates { - fail!("Cannot try to parse dates without examples, prefer serialize_into_field(s)"); - } - self.0.ensure_utf8()?; visitor.visit_string(Default::default()) } @@ -446,173 +438,3 @@ impl<'de, 'a> serde::de::Deserializer<'de> for IdentifierDeserializer<'a> { 
unimplemented!('de, deserialize_enum, _: &'static str, _: &'static [&'static str]); unimplemented!('de, deserialize_ignored_any); } - -#[cfg(test)] -mod test { - use std::collections::HashMap; - - use serde::Deserialize; - - use crate::internal::{ - schema::{GenericDataType as T, GenericField as F, Strategy}, - tracing::{SchemaTracer, TracingOptions}, - }; - - fn trace_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> F { - let mut schema = SchemaTracer::new(options); - schema.trace_type::().unwrap(); - schema.to_field("root").unwrap() - } - - #[test] - fn trace_primitives() { - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::I8, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::I16, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::I32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::I64, false) - ); - - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::U8, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::U16, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::U32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::U64, false) - ); - - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::F32, false) - ); - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::F64, false) - ); - } - - #[test] - fn trace_option() { - assert_eq!( - trace_type::(TracingOptions::default()), - F::new("root", T::I8, false) - ); - assert_eq!( - trace_type::>(TracingOptions::default()), - F::new("root", T::I8, true) - ); - } - - #[test] - fn trace_struct() { - #[allow(dead_code)] - #[derive(Deserialize)] - struct Example { - a: bool, - b: Option, - } - - let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Struct, false) 
- .with_child(F::new("a", T::Bool, false)) - .with_child(F::new("b", T::I8, true)); - - assert_eq!(actual, expected); - } - - #[test] - fn trace_tuple_as_struct() { - let actual = trace_type::<(bool, Option)>(TracingOptions::default()); - let expected = F::new("root", T::Struct, false) - .with_child(F::new("0", T::Bool, false)) - .with_child(F::new("1", T::I8, true)) - .with_strategy(Strategy::TupleAsStruct); - - assert_eq!(actual, expected); - } - - #[test] - fn trace_union() { - #[allow(dead_code)] - #[derive(Deserialize)] - enum Example { - A(i8), - B(f32), - } - - let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Union, false) - .with_child(F::new("A", T::I8, false)) - .with_child(F::new("B", T::F32, false)); - - assert_eq!(actual, expected); - } - - #[test] - fn trace_list() { - let actual = trace_type::>(TracingOptions::default()); - let expected = - F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); - - assert_eq!(actual, expected); - } - - #[test] - fn trace_map() { - let actual = - trace_type::>(TracingOptions::default().map_as_struct(false)); - let expected = F::new("root", T::Map, false).with_child( - F::new("entries", T::Struct, false) - .with_child(F::new("key", T::I8, false)) - .with_child(F::new("value", T::LargeUtf8, false)), - ); - - assert_eq!(actual, expected); - } - - #[test] - fn issue_90() { - #[derive(Deserialize)] - pub struct Distribution { - pub samples: Vec, - pub statistic: String, - } - - #[derive(Deserialize)] - pub struct VectorMetric { - pub distribution: Option, - } - - let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Struct, false).with_child( - F::new("distribution", T::Struct, true) - .with_child(F::new("samples", T::LargeList, false).with_child(F::new( - "element", - T::F64, - false, - ))) - .with_child(F::new("statistic", T::LargeUtf8, false)), - ); - - assert_eq!(actual, expected); - } -} diff --git 
a/serde_arrow/src/test_impls/chrono.rs b/serde_arrow/src/test_impls/chrono.rs index 381c9052..dbfac8e5 100644 --- a/serde_arrow/src/test_impls/chrono.rs +++ b/serde_arrow/src/test_impls/chrono.rs @@ -240,3 +240,16 @@ test_example!( ], nulls = [false, false, false], ); + +test_example!( + test_name = incompatible_date_formats, + test_bytecode_deserialization = true, + tracing_options = TracingOptions::default().guess_dates(true), + field = GenericField::new("root", GenericDataType::LargeUtf8, false), + ty = String, + values = [ + String::from("2015-09-18T23:56:04Z"), + String::from("2023-08-14T17:00:04"), + ], + nulls = [false, false], +); diff --git a/serde_arrow/src/test_impls/issue_74.rs b/serde_arrow/src/test_impls/issue_74_unknown_fields.rs similarity index 100% rename from serde_arrow/src/test_impls/issue_74.rs rename to serde_arrow/src/test_impls/issue_74_unknown_fields.rs diff --git a/serde_arrow/src/test_impls/issue_79.rs b/serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs similarity index 100% rename from serde_arrow/src/test_impls/issue_79.rs rename to serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs b/serde_arrow/src/test_impls/issue_90_type_tracing.rs new file mode 100644 index 00000000..c1d7ff3b --- /dev/null +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -0,0 +1,239 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::internal::{ + schema::{GenericDataType as T, GenericField as F, Strategy}, + tracing::{SchemaTracer, TracingOptions}, +}; + +fn trace_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> F { + let mut schema = SchemaTracer::new(options); + schema.trace_type::().unwrap(); + schema.to_field("root").unwrap() +} + +#[test] +fn issue_90() { + #[derive(Deserialize)] + pub struct Distribution { + pub samples: Vec, + pub statistic: String, + } + + #[derive(Deserialize)] + pub struct 
VectorMetric { + pub distribution: Option, + } + + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Struct, false).with_child( + F::new("distribution", T::Struct, true) + .with_child(F::new("samples", T::LargeList, false).with_child(F::new( + "element", + T::F64, + false, + ))) + .with_child(F::new("statistic", T::LargeUtf8, false)), + ); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_primitives() { + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U8, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U16, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::U64, false) + ); + + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::F32, false) + ); + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::F64, false) + ); +} + +#[test] +fn trace_option() { + assert_eq!( + trace_type::(TracingOptions::default()), + F::new("root", T::I8, false) + ); + assert_eq!( + trace_type::>(TracingOptions::default()), + F::new("root", T::I8, true) + ); +} + +#[test] +fn trace_struct() { + #[allow(dead_code)] + #[derive(Deserialize)] + struct Example { + a: bool, + b: Option, + } + + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("a", T::Bool, false)) + .with_child(F::new("b", T::I8, true)); + + assert_eq!(actual, expected); +} + +#[test] +fn 
trace_tuple_as_struct() { + let actual = trace_type::<(bool, Option)>(TracingOptions::default()); + let expected = F::new("root", T::Struct, false) + .with_child(F::new("0", T::Bool, false)) + .with_child(F::new("1", T::I8, true)) + .with_strategy(Strategy::TupleAsStruct); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_union() { + #[allow(dead_code)] + #[derive(Deserialize)] + enum Example { + A(i8), + B(f32), + } + + let actual = trace_type::(TracingOptions::default()); + let expected = F::new("root", T::Union, false) + .with_child(F::new("A", T::I8, false)) + .with_child(F::new("B", T::F32, false)); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_list() { + let actual = trace_type::>(TracingOptions::default()); + let expected = + F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); + + assert_eq!(actual, expected); +} + +#[test] +fn trace_map() { + let actual = trace_type::>(TracingOptions::default().map_as_struct(false)); + let expected = F::new("root", T::Map, false).with_child( + F::new("entries", T::Struct, false) + .with_child(F::new("key", T::I8, false)) + .with_child(F::new("value", T::LargeUtf8, false)), + ); + + assert_eq!(actual, expected); +} + +mod mixed_tracing_dates { + use super::*; + + #[derive(Serialize, Deserialize)] + struct Example { + opt: Option, + date: String, + } + + fn expected() -> Vec { + vec![ + F::new("opt", T::U32, true), + F::new("date", T::Date64, false).with_strategy(Strategy::UtcStrAsDate64), + ] + } + + fn samples() -> Vec { + vec![Example { + opt: None, + date: String::from("2015-09-18T23:56:04Z"), + }] + } + + #[test] + fn type_then_samples() { + let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + + tracer.trace_type::().unwrap(); + tracer.trace_samples(&samples()).unwrap(); + + let actual = tracer.to_fields().unwrap(); + assert_eq!(actual, expected()); + } + + #[test] + fn samples_then_type() { + let mut tracer = 
SchemaTracer::new(TracingOptions::default().guess_dates(true)); + + tracer.trace_samples(&samples()).unwrap(); + tracer.trace_type::().unwrap(); + + let actual = tracer.to_fields().unwrap(); + assert_eq!(actual, expected()); + } + + #[test] + fn invalid_values_first() { + let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + + tracer.trace_samples(&["foo bar"]).unwrap(); + tracer.trace_type::().unwrap(); + tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); + + let actual = tracer.to_field("root").unwrap(); + let expected = F::new("root", T::LargeUtf8, false); + + assert_eq!(actual, expected); + } + + #[test] + fn invalid_values_last() { + let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + + tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); + tracer.trace_type::().unwrap(); + tracer.trace_samples(&["foo bar"]).unwrap(); + + let actual = tracer.to_field("root").unwrap(); + let expected = F::new("root", T::LargeUtf8, false); + + assert_eq!(actual, expected); + } +} diff --git a/serde_arrow/src/test_impls/mod.rs b/serde_arrow/src/test_impls/mod.rs index 27945544..c48f9670 100644 --- a/serde_arrow/src/test_impls/mod.rs +++ b/serde_arrow/src/test_impls/mod.rs @@ -12,5 +12,6 @@ mod r#union; mod utils; mod wrappers; -mod issue_74; -mod issue_79; +mod issue_74_unknown_fields; +mod issue_79_declared_but_missing_fields; +mod issue_90_type_tracing; From d074ba8215e451a9812a2b77d50fe6237c99d4a7 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 29 Oct 2023 13:45:58 +0100 Subject: [PATCH 12/27] Experiment with the API --- serde_arrow/src/arrow/mod.rs | 15 ++-- serde_arrow/src/arrow/schema.rs | 13 ++- serde_arrow/src/arrow2/mod.rs | 14 +-- serde_arrow/src/arrow2/schema.rs | 13 ++- serde_arrow/src/internal/generic.rs | 9 ++ serde_arrow/src/internal/schema.rs | 36 ++++++-- serde_arrow/src/internal/tracing/mod.rs | 69 +++----------- serde_arrow/src/internal/tracing/samples.rs | 1 + 
serde_arrow/src/internal/tracing/tracer.rs | 21 ++++- serde_arrow/src/internal/tracing/types.rs | 2 + serde_arrow/src/lib.rs | 41 ++++++++- serde_arrow/src/test_end_to_end/issue_90.rs | 50 +++++++++++ serde_arrow/src/test_end_to_end/mod.rs | 3 + .../src/test_impls/issue_90_type_tracing.rs | 90 ++++++++++++++++--- 14 files changed, 274 insertions(+), 103 deletions(-) create mode 100644 serde_arrow/src/test_end_to_end/issue_90.rs create mode 100644 serde_arrow/src/test_end_to_end/mod.rs diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index a732700b..1a0dc38b 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -23,7 +23,7 @@ use crate::{ serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::{SchemaTracer, TracingOptions}, + tracing::{Tracer, TracingOptions}, }, }; @@ -71,8 +71,10 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let mut schema = SchemaTracer::new(options); - schema.trace_samples(items)?; - let field = schema.to_field(name)?; + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + + let field = tracer.to_field(name)?; Field::try_from(&field) } diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 5dd105ef..63b7401b 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -3,8 +3,8 @@ use crate::{ _impl::arrow::datatypes::{DataType, Field, TimeUnit, UnionMode}, internal::{ error::{error, fail, Error, Result}, + generic, schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, - tracing::SchemaTracer, }, }; @@ -30,13 +30,12 @@ impl Schema { pub fn to_arrow_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } -} -/// Support for arrow types (requires 
one of the `arrow-*` features) -impl SchemaTracer { - /// Build a vec of fields from a TracedSchema object - pub fn to_arrow_fields(&self) -> Result> { - self.to_fields()?.iter().map(Field::try_from).collect() + /// If this schema object has a single valid field, return it + pub fn to_arrow_field(&self) -> Result { + let fields = self.to_arrow_fields()?; + generic::to_single_item(fields) + .ok_or_else(|| error!("schema does not have exactly one field")) } } diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 9f76517c..174ce44f 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -22,7 +22,7 @@ use crate::{ serialization::{compile_serialization, CompilationOptions, Interpreter}, sink::serialize_into_sink, source::deserialize_from_source, - tracing::{SchemaTracer, TracingOptions}, + tracing::{Tracer, TracingOptions}, }, }; @@ -70,8 +70,10 @@ pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result(items: &T, name: &str, options: TracingOptions) - where T: Serialize + ?Sized, { - let mut schema = SchemaTracer::new(options); - schema.trace_samples(items)?; - let field = schema.to_field(name)?; + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + let field = tracer.to_field(name)?; Field::try_from(&field) } diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 9faa513b..2f009df7 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -2,8 +2,8 @@ use crate::{ _impl::arrow2::datatypes::{DataType, Field, IntegerType, TimeUnit, UnionMode}, internal::{ error::{error, fail, Error, Result}, + generic, schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, - tracing::SchemaTracer, }, }; @@ -29,13 +29,12 @@ impl Schema { pub fn to_arrow2_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } -} -/// Support for arrow2 types (requires one of 
the `arrow2-*` features) -impl SchemaTracer { - /// Build a vec of arrow2 fields from a TracedSchema object - pub fn to_arrow2_fields(&self) -> Result> { - self.to_fields()?.iter().map(Field::try_from).collect() + /// If this schema object has a single valid field, return it + pub fn to_arrow2_field(&self) -> Result { + let fields = self.to_arrow2_fields()?; + generic::to_single_item(fields) + .ok_or_else(|| error!("schema does not have exactly one field")) } } diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index 1295b1c4..1688fc97 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -10,6 +10,15 @@ use crate::internal::{ source::deserialize_from_source, }; +/// Return the item if the vector has exactly one +pub fn to_single_item(items: Vec) -> Option { + if items.len() == 1 { + items.into_iter().next() + } else { + None + } +} + pub struct GenericBuilder(pub serialization::Interpreter); impl GenericBuilder { diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index 4f9644b1..a39d6086 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -3,7 +3,10 @@ use std::{ str::FromStr, }; -use crate::internal::error::{fail, Error, Result}; +use crate::internal::{ + error::{fail, Error, Result}, + tracing::{Tracer, TracingOptions}, +}; use serde::{Deserialize, Serialize}; @@ -92,10 +95,26 @@ impl Schema { Self::default() } - #[allow(unused)] - fn with_field(mut self, field: GenericField) -> Self { - self.fields.push(field); - self + /// Determine the schema from the given type + /// + /// For more control consider using the underlying + /// [SchemaTracer][crate::schema::SchemaTracer] directly. 
+ /// + pub fn from_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> Result { + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_type::()?; + tracer.to_schema() + } + + /// Determine the schema from the given samples + /// + /// For more control consider using the underlying + /// [SchemaTracer][crate::schema::SchemaTracer] directly. + /// + pub fn from_samples(options: TracingOptions, samples: &T) -> Result { + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(samples)?; + tracer.to_schema() } } @@ -748,6 +767,13 @@ mod test_schema_serialization { use super::{GenericField, Schema}; + impl Schema { + fn with_field(mut self, field: GenericField) -> Self { + self.fields.push(field); + self + } + } + #[test] fn example() { let schema = Schema::new() diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index 5a4ad87c..3ed76dd7 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -2,13 +2,6 @@ pub mod samples; pub mod tracer; pub mod types; -use serde::{Deserialize, Serialize}; - -use crate::internal::{ - error::{fail, Result}, - schema::{GenericDataType, GenericField, Schema}, -}; - pub use tracer::Tracer; /// Configure how the schema is traced @@ -76,6 +69,13 @@ pub struct TracingOptions { /// [`NaiveStrAsDate64`][crate::schema::Strategy::NaiveStrAsDate64] or /// [`UtcStrAsDate64`][crate::schema::Strategy::UtcStrAsDate64]. pub guess_dates: bool, + + /// If not `None`, trace the schema as a field with the given name instead + /// of multiple fields + /// + /// This may be helpful when the individual items are not structs, but other + /// objects, e.g., numbers or strings. 
+ pub as_field: Option, } impl Default for TracingOptions { @@ -86,6 +86,7 @@ impl Default for TracingOptions { string_dictionary_encoding: false, coerce_numbers: false, guess_dates: false, + as_field: None, } } } @@ -124,56 +125,10 @@ impl TracingOptions { self.guess_dates = value; self } -} - -/// Collect schema information from samples and types -pub struct SchemaTracer { - tracer: Tracer, -} - -impl SchemaTracer { - /// Construct a new instance with the given options - pub fn new(options: TracingOptions) -> Self { - Self { - tracer: Tracer::new(String::from("$"), options), - } - } - - pub(crate) fn to_field(&self, name: &str) -> Result { - self.tracer.to_field(name) - } - - pub(crate) fn to_fields(&self) -> Result> { - let root = self.tracer.to_field("root")?; - - match root.data_type { - GenericDataType::Struct => Ok(root.children), - GenericDataType::Null => fail!("No records found to determine schema"), - dt => fail!("Unexpected root data type {dt:?}"), - } - } - - /// Convert the traced schema into a schema object - pub fn to_schema(&self) -> Result { - Ok(Schema { - fields: self.to_fields()?, - }) - } -} - -impl SchemaTracer { - /// Trace the given samples and collect schema information - pub fn trace_samples(&mut self, samples: &T) -> Result<()> { - self.tracer.reset()?; - self.tracer.trace_samples(samples) - } - /// Trace the given type and collect schema information - pub fn trace_type<'de, T: Deserialize<'de>>(&mut self) -> Result<()> { - self.tracer.reset()?; - self.tracer.trace_type::() + /// Set [`as_field`](#structfield.as_field) + pub fn as_field>(mut self, value: S) -> Self { + self.as_field = Some(value.into()); + self } } - -#[test] -fn test_trace_type() {} diff --git a/serde_arrow/src/internal/tracing/samples.rs b/serde_arrow/src/internal/tracing/samples.rs index 8c1ceae6..1e7cbb85 100644 --- a/serde_arrow/src/internal/tracing/samples.rs +++ b/serde_arrow/src/internal/tracing/samples.rs @@ -18,6 +18,7 @@ use crate::internal::{ impl Tracer { 
pub fn trace_samples(&mut self, samples: &T) -> Result<()> { + self.reset()?; let mut tracer = StripOuterSequenceSink::new(&mut *self); serialize_into_sink(&mut tracer, samples) } diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index 5ce98cbf..c6b2b2b8 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use crate::internal::{ error::{fail, Result}, - schema::{GenericDataType, GenericField, Strategy}, + schema::{GenericDataType, GenericField, Schema, Strategy}, tracing::TracingOptions, }; @@ -37,6 +37,24 @@ impl Tracer { pub fn new(path: String, options: TracingOptions) -> Self { Self::Unknown(UnknownTracer::new(path, options)) } + + /// Convert the traced schema into a schema object + pub fn to_schema(&self) -> Result { + let fields = if let Some(field_name) = self.get_options().as_field.as_ref() { + let field = self.to_field(field_name)?; + vec![field] + } else { + let root = self.to_field("root")?; + + match root.data_type { + GenericDataType::Struct => root.children, + GenericDataType::Null => fail!("No records found to determine schema"), + dt => fail!("Unexpected root data type {dt:?}"), + } + }; + + Ok(Schema { fields }) + } } impl Tracer { @@ -977,6 +995,7 @@ impl PrimitiveTracer { } match &self.item_type { + D::Null => Ok(GenericField::new(name, D::Null, true)), dt @ (D::LargeUtf8 | D::Utf8) => { if !self.options.string_dictionary_encoding { Ok(GenericField::new(name, dt.clone(), self.nullable)) diff --git a/serde_arrow/src/internal/tracing/types.rs b/serde_arrow/src/internal/tracing/types.rs index c972db5a..f4c1c509 100644 --- a/serde_arrow/src/internal/tracing/types.rs +++ b/serde_arrow/src/internal/tracing/types.rs @@ -10,6 +10,8 @@ use crate::internal::{ impl Tracer { pub fn trace_type<'de, T: Deserialize<'de>>(&mut self) -> Result<()> { + self.reset()?; + // TODO: make configurable let mut attempts = 
100; while !self.is_complete() { diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 141e7405..54b3dd12 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -212,6 +212,9 @@ pub mod arrow; #[cfg(all(test, has_arrow, has_arrow2))] mod test_impls; +#[cfg(all(test, has_arrow, has_arrow2))] +mod test_end_to_end; + #[cfg(test)] mod test; @@ -253,8 +256,44 @@ pub use crate::internal::error::{Error, Result}; pub mod schema { pub use crate::internal::{ schema::{Schema, Strategy, STRATEGY_KEY}, - tracing::{SchemaTracer, TracingOptions}, + tracing::TracingOptions, }; + + /// Trace the schema from type information and samples + pub struct SchemaTracer(crate::internal::tracing::Tracer); + + impl SchemaTracer { + /// Build a new schema tracer with the given options + pub fn new(options: TracingOptions) -> Self { + Self(crate::internal::tracing::Tracer::new( + String::from("$"), + options, + )) + } + + /// Trace type information of the given type + /// + /// Note: the given type should be the type of the element, not the + /// sequence of elements. + pub fn trace_type<'de, T: serde::Deserialize<'de>>(&mut self) -> crate::Result<()> { + self.0.trace_type::() + } + + /// Trace type information from the given samples + /// + /// Note: the given samples should be a sequence of elements. 
+ pub fn trace_samples( + &mut self, + samples: &T, + ) -> crate::Result<()> { + self.0.trace_samples(samples) + } + + /// Convert the tracer to a schema + pub fn to_schema(&self) -> crate::Result { + self.0.to_schema() + } + } } /// Experimental functionality that is not bound by semver compatibility diff --git a/serde_arrow/src/test_end_to_end/issue_90.rs b/serde_arrow/src/test_end_to_end/issue_90.rs new file mode 100644 index 00000000..100603d9 --- /dev/null +++ b/serde_arrow/src/test_end_to_end/issue_90.rs @@ -0,0 +1,50 @@ +use serde::{Deserialize, Serialize}; + +use crate::{ + arrow::serialize_into_arrays, + schema::{Schema, TracingOptions}, +}; + +#[test] +fn example() -> Result<(), PanicOnError> { + #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] + pub struct Distribution { + pub samples: Vec, + pub statistic: String, + } + + #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] + pub struct VectorMetric { + pub distribution: Option, + } + + let schema = Schema::from_type::(TracingOptions::default())?; + let fields = schema.to_arrow_fields()?; + + let metrics = [ + VectorMetric { + distribution: Some(Distribution { + samples: vec![1.0, 2.0, 3.0], + statistic: String::from("metric1"), + }), + }, + VectorMetric { + distribution: Some(Distribution { + samples: vec![4.0, 5.0, 6.0], + statistic: String::from("metric2"), + }), + }, + ]; + + let _arrays = serialize_into_arrays(&fields, &metrics)?; + Ok(()) +} + +#[derive(Debug)] +struct PanicOnError; + +impl From for PanicOnError { + fn from(value: E) -> Self { + panic!("{value}"); + } +} diff --git a/serde_arrow/src/test_end_to_end/mod.rs b/serde_arrow/src/test_end_to_end/mod.rs new file mode 100644 index 00000000..8d9b4ddf --- /dev/null +++ b/serde_arrow/src/test_end_to_end/mod.rs @@ -0,0 +1,3 @@ +//! Test end to end examples to ensure the API works as designed +//! 
+mod issue_90; diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs b/serde_arrow/src/test_impls/issue_90_type_tracing.rs index c1d7ff3b..22749d7e 100644 --- a/serde_arrow/src/test_impls/issue_90_type_tracing.rs +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -4,13 +4,15 @@ use serde::{Deserialize, Serialize}; use crate::internal::{ schema::{GenericDataType as T, GenericField as F, Strategy}, - tracing::{SchemaTracer, TracingOptions}, + tracing::{Tracer, TracingOptions}, }; fn trace_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> F { - let mut schema = SchemaTracer::new(options); - schema.trace_type::().unwrap(); - schema.to_field("root").unwrap() + let mut tracer = Tracer::new(String::from("$"), options.as_field("root")); + tracer.trace_type::().unwrap(); + + let schema = tracer.to_schema().unwrap(); + schema.fields.into_iter().next().unwrap() } #[test] @@ -42,6 +44,10 @@ fn issue_90() { #[test] fn trace_primitives() { + assert_eq!( + trace_type::<()>(TracingOptions::default().allow_null_fields(true)), + F::new("root", T::Null, true), + ); assert_eq!( trace_type::(TracingOptions::default()), F::new("root", T::I8, false) @@ -189,35 +195,50 @@ mod mixed_tracing_dates { #[test] fn type_then_samples() { - let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + let mut tracer = Tracer::new( + String::from("$"), + TracingOptions::default().guess_dates(true), + ); tracer.trace_type::().unwrap(); tracer.trace_samples(&samples()).unwrap(); - let actual = tracer.to_fields().unwrap(); + let actual = tracer.to_schema().unwrap().fields; assert_eq!(actual, expected()); } #[test] fn samples_then_type() { - let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + let mut tracer = Tracer::new( + String::from("$"), + TracingOptions::default().guess_dates(true), + ); tracer.trace_samples(&samples()).unwrap(); tracer.trace_type::().unwrap(); - let actual = tracer.to_fields().unwrap(); + let 
actual = tracer.to_schema().unwrap().fields; assert_eq!(actual, expected()); } #[test] fn invalid_values_first() { - let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + let mut tracer = Tracer::new( + String::from("$"), + TracingOptions::default().guess_dates(true).as_field("root"), + ); tracer.trace_samples(&["foo bar"]).unwrap(); tracer.trace_type::().unwrap(); tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); - - let actual = tracer.to_field("root").unwrap(); + + let actual = tracer + .to_schema() + .unwrap() + .fields + .into_iter() + .next() + .unwrap(); let expected = F::new("root", T::LargeUtf8, false); assert_eq!(actual, expected); @@ -225,15 +246,58 @@ mod mixed_tracing_dates { #[test] fn invalid_values_last() { - let mut tracer = SchemaTracer::new(TracingOptions::default().guess_dates(true)); + let mut tracer = Tracer::new( + String::from("$"), + TracingOptions::default().guess_dates(true).as_field("root"), + ); tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); tracer.trace_type::().unwrap(); tracer.trace_samples(&["foo bar"]).unwrap(); - let actual = tracer.to_field("root").unwrap(); + let actual = tracer + .to_schema() + .unwrap() + .fields + .into_iter() + .next() + .unwrap(); let expected = F::new("root", T::LargeUtf8, false); assert_eq!(actual, expected); } } + +mod mixed_tracing_unions { + use crate::internal::{generic, tracing}; + + use super::*; + + #[test] + fn example() { + #[derive(Serialize, Deserialize)] + enum E { + A, + B, + C(u32), + } + + let mut tracer = tracing::Tracer::new( + String::from("$"), + TracingOptions::default() + .allow_null_fields(true) + .as_field("root"), + ); + tracer.trace_type::().unwrap(); + tracer.trace_samples(&[E::A, E::C(32)]).unwrap(); + let schema = tracer.to_schema().unwrap(); + + let actual = generic::to_single_item(schema.fields).unwrap(); + let expected = F::new("root", T::Union, false) + .with_child(F::new("A", T::Null, true)) + .with_child(F::new("B", T::Null, 
true)) + .with_child(F::new("C", T::U32, false)); + + assert_eq!(actual, expected); + } +} From 6768bc5716b7833b9c820366ecdcd26da67c9440 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 10:10:51 +0100 Subject: [PATCH 13/27] Implement single-item API, rework tests to remove deprecated API --- Changes.md | 39 ++ example/src/main.rs | 20 +- serde_arrow/Quickstart.md | 2 + serde_arrow/src/arrow/api.rs | 340 ++++++++++++ serde_arrow/src/arrow/mod.rs | 407 +------------- serde_arrow/src/arrow/schema.rs | 19 +- serde_arrow/src/arrow2/api.rs | 522 ++++++++++++++++++ serde_arrow/src/arrow2/mod.rs | 405 +------------- serde_arrow/src/arrow2/schema.rs | 19 +- serde_arrow/src/internal/config.rs | 3 +- serde_arrow/src/internal/error.rs | 11 + serde_arrow/src/internal/generic.rs | 86 ++- serde_arrow/src/internal/schema.rs | 50 +- serde_arrow/src/internal/tracing/mod.rs | 14 - serde_arrow/src/internal/tracing/tracer.rs | 25 +- serde_arrow/src/lib.rs | 95 +--- serde_arrow/src/schema.rs | 40 ++ serde_arrow/src/test_end_to_end/issue_90.rs | 65 ++- serde_arrow/src/test_end_to_end/test_items.rs | 28 + serde_arrow/src/test_impls/chrono.rs | 42 +- serde_arrow/src/test_impls/dictionary.rs | 124 ++--- serde_arrow/src/test_impls/examples.rs | 16 +- .../src/test_impls/issue_74_unknown_fields.rs | 4 +- .../issue_79_declared_but_missing_fields.rs | 4 +- .../src/test_impls/issue_90_type_tracing.rs | 76 +-- serde_arrow/src/test_impls/json_values.rs | 18 +- serde_arrow/src/test_impls/list.rs | 20 +- serde_arrow/src/test_impls/macros.rs | 218 ++------ serde_arrow/src/test_impls/map.rs | 30 +- serde_arrow/src/test_impls/mod.rs | 1 - serde_arrow/src/test_impls/primitives.rs | 86 +-- serde_arrow/src/test_impls/struct.rs | 26 +- serde_arrow/src/test_impls/tuple.rs | 14 +- serde_arrow/src/test_impls/union.rs | 64 +-- serde_arrow/src/test_impls/utils.rs | 29 - serde_arrow/src/test_impls/wrappers.rs | 61 +- serde_arrow/src/utils.rs | 2 + 37 files changed, 1547 insertions(+), 1478 
deletions(-) create mode 100644 serde_arrow/src/arrow/api.rs create mode 100644 serde_arrow/src/arrow2/api.rs create mode 100644 serde_arrow/src/schema.rs create mode 100644 serde_arrow/src/test_end_to_end/test_items.rs delete mode 100644 serde_arrow/src/test_impls/utils.rs create mode 100644 serde_arrow/src/utils.rs diff --git a/Changes.md b/Changes.md index 51d6d2cb..c369415e 100644 --- a/Changes.md +++ b/Changes.md @@ -2,9 +2,48 @@ ## 0.9.0 +Breaking changes: + +- Make tracing options non-exhaustive - Remove `try_parse_dates` in favor of `guess_dates` field in `TracingOptions` +- Remove experimental configuration api + +New feature: Improved schema tracing: + - Add type based tracing to allow schema tracing without samples +Deprecations: + +- Rename `serde_arrow::schema::Schema` to + `serde_arrow::schema::SerdeArrowSchema` to prevent name clashes with the + schema types of `arrow` and `arrow2`. +- Deprecate `serialize_into_arrays`, `deserialize_from_arrays` methods in favor of + `to_arrow` / `to_arrow2` and `from_arrow` / `from_arrow2` +- Deprecate `serialize_into_fields` methods in favor of + `SerdeArrowSchema::from_samples` +- Deprecated single item methods in favor of using the `Items` and `Item` + wrappers + +Migration guide: + +```rust +// old: +serde_arrow::arrow::serialize_into_arrays(&fields, &items)? +// new: +serde_arrow::to_arrow(&fields, &items)? + +// old +serde_arrow::arrow::deserialize_from_arrays(&fields, &arrays)? +// new +serde::from_arrow(&fields, &arrays)? + +// old +serde_arrow::arrow::serialize_into_fields(&items)? +// new +use serde_arrow::schema::SerdeArrowSchema; +SerdeArrowSchema::from_samples(&items)?.to_arrow_fields()? 
+``` + ## 0.8.0 Make bytecode based serialization and deserialization the default diff --git a/example/src/main.rs b/example/src/main.rs index e1eb78c4..33fc6510 100644 --- a/example/src/main.rs +++ b/example/src/main.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, fs::File, path::Path}; +use std::{collections::HashMap, convert::TryInto, fs::File, path::Path}; use chrono::NaiveDateTime; use serde::Serialize; @@ -6,10 +6,9 @@ use serde::Serialize; use arrow2::{ array::Array, chunk::Chunk, - datatypes::{DataType, Field, Schema}, + datatypes::{Field, Schema}, io::ipc::write, }; -use serde_arrow::schema::Strategy; macro_rules! hashmap { () => { @@ -79,17 +78,12 @@ fn main() -> Result<(), PanicOnError> { }, ]; - use serde_arrow::arrow2::{serialize_into_arrays, serialize_into_fields}; + use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; - let mut fields = serialize_into_fields(&examples, Default::default())?; - - for field in &mut fields { - if field.name == "date64" { - *field = Field::new("date64", DataType::Date64, false) - .with_metadata(Strategy::NaiveStrAsDate64.into()); - } - } - let arrays = serialize_into_arrays(&fields, &examples)?; + let fields: Vec = + SerdeArrowSchema::from_samples(&examples, TracingOptions::default().guess_dates(true))? 
+ .try_into()?; + let arrays = serde_arrow::to_arrow2(&fields, &examples)?; let schema = Schema::from(fields); let chunk = Chunk::new(arrays); diff --git a/serde_arrow/Quickstart.md b/serde_arrow/Quickstart.md index 5242e0d5..39fc5ad9 100644 --- a/serde_arrow/Quickstart.md +++ b/serde_arrow/Quickstart.md @@ -122,6 +122,8 @@ will be mapped to the following arrow union: ## Specifying the schema in JSON +TODO: cross-reference + ```rust let schema_json = r#" [ diff --git a/serde_arrow/src/arrow/api.rs b/serde_arrow/src/arrow/api.rs new file mode 100644 index 00000000..5690e478 --- /dev/null +++ b/serde_arrow/src/arrow/api.rs @@ -0,0 +1,340 @@ +#![deny(missing_docs)] +use serde::{Deserialize, Serialize}; + +use crate::{ + _impl::arrow::{ + array::{Array, ArrayRef}, + datatypes::Field, + }, + internal::{ + error::Result, + generic, + schema::GenericField, + serialization::{compile_serialization, CompilationOptions, Interpreter}, + sink::serialize_into_sink, + source::deserialize_from_source, + tracing::{Tracer, TracingOptions}, + }, +}; + +/// Deserialize items from arrow arrays (*requires one of the `arrow-*` +/// features*) +/// +/// The type should be a list of records (e.g., a vector of structs). 
+/// +/// ```rust +/// use serde::{Deserialize, Serialize}; +/// use serde_arrow::{ +/// arrow::{ +/// deserialize_from_arrays, +/// serialize_into_arrays, +/// serialize_into_fields, +/// }, +/// schema::TracingOptions, +/// }; +/// +/// ##[derive(Deserialize, Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// // provide an example record to get the field information +/// let fields = serialize_into_fields( +/// &[Record { a: Some(1.0), b: 2}], +/// TracingOptions::default(), +/// ).unwrap(); +/// # let items = &[Record { a: Some(1.0), b: 2}]; +/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// # +/// +/// // deserialize the records from arrays +/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); +/// ``` +/// +pub fn from_arrow<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +where + T: Deserialize<'de>, + A: AsRef, +{ + use crate::internal::{ + common::{BufferExtract, Buffers}, + deserialization, + }; + + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let num_items = arrays + .iter() + .map(|a| a.as_ref().len()) + .min() + .unwrap_or_default(); + + let mut buffers = Buffers::new(); + let mut mappings = Vec::new(); + for (field, array) in fields.iter().zip(arrays.iter()) { + mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); + } + + let interpreter = deserialization::compile_deserialization( + num_items, + &mappings, + buffers, + deserialization::CompilationOptions::default(), + )?; + deserialize_from_source(interpreter) +} + +/// Determine the schema (as a list of fields) for the given items +#[deprecated = "serialize_into_fields is deprecated. 
Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] +pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> +where + T: Serialize + ?Sized, +{ + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + + let schema = tracer.to_schema()?; + schema.to_arrow_fields() +} + +/// Determine the schema of an object that represents a single array +#[deprecated = "serialize_into_field is deprecated. Use serde_arrow::to_arrow with serde_arrow::utils::Items instead"] +pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result +where + T: Serialize + ?Sized, +{ + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + + let field = tracer.to_field(name)?; + Field::try_from(&field) +} + +/// Build arrays from the given items +#[deprecated = "serialize_into_arrays is deprecated. Use serde_arrow::to_arrow instead"] +pub fn serialize_into_arrays( + fields: &[Field], + items: &T, +) -> Result> { + to_arrow(fields, items) +} + +/// Build arrow arrays from the given items (*requires one of the `arrow-*` +/// features*) +/// +/// `items` should be given in the form of a list of records (e.g., a vector of +/// structs). +/// +/// Example: +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde::{Serialize, Deserialize}; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// +/// ##[derive(Serialize, Deserialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let items = vec![ +/// Record { a: Some(1.0), b: 2}, +/// // ... 
+/// ]; +/// +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())?.to_arrow_fields()?; +/// let arrays = serde_arrow::to_arrow(&fields, &items)?; +/// +/// assert_eq!(arrays.len(), 2); +/// # Ok(()) +/// # } +/// ``` +/// +pub fn to_arrow(fields: &[Field], items: &T) -> Result> { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let program = compile_serialization(&fields, CompilationOptions::default())?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + interpreter.build_arrow_arrays() +} + +/// Deserialize a type from the given arrays +#[deprecated = "deserialize_from_arrays is deprecated. Use serde_arrow::from_arrow instead"] +pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +where + T: Deserialize<'de>, + A: AsRef, +{ + from_arrow(fields, arrays) +} + +/// Serialize an object that represents a single array into an array +#[deprecated = "serialize_into_array is deprecated. 
Use serde_arrow::arrow::ArrayBuilder instead"] +pub fn serialize_into_array(field: &Field, items: &T) -> Result +where + T: Serialize + ?Sized, +{ + let field: GenericField = field.try_into()?; + + let program = compile_serialization( + std::slice::from_ref(&field), + CompilationOptions::default().wrap_with_struct(false), + )?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + interpreter.build_arrow_array() +} + +/// Deserialize a sequence of objects from a single array +#[deprecated = "deserialize_from_array is deprecated"] +pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result +where + T: Deserialize<'de>, + A: AsRef + 'de + ?Sized, +{ + generic::deserialize_from_array(field, array.as_ref()) +} + +/// Build a single array item by item +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow as arrow; +/// use arrow::datatypes::{Field, DataType}; +/// use serde_arrow::arrow::ArrayBuilder; +/// +/// let field = Field::new("value", DataType::Int64, false); +/// let mut builder = ArrayBuilder::new(&field).unwrap(); +/// +/// builder.push(&-1_i64).unwrap(); +/// builder.push(&2_i64).unwrap(); +/// builder.push(&-3_i64).unwrap(); +/// +/// builder.extend(&[4_i64, -5, 6]).unwrap(); +/// +/// let array = builder.build_array().unwrap(); +/// assert_eq!(array.len(), 6); +/// ``` +pub struct ArrayBuilder(generic::GenericBuilder); + +impl ArrayBuilder { + /// Construct a new build for the given field + /// + /// This method may fail for an unsupported data type of the given field. 
+ /// + pub fn new(field: &Field) -> Result { + Ok(Self(generic::GenericBuilder::new_for_array( + GenericField::try_from(field)?, + )?)) + } + + /// Add a single item to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } + + /// Add multiple items to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the array from the rows pushed so far. + /// + /// This operation will reset the underlying buffers and start a new batch. + /// + pub fn build_array(&mut self) -> Result { + self.0 .0.build_arrow_array() + } +} + +/// Build arrays record by record +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow as arrow; +/// use arrow::datatypes::{DataType, Field}; +/// use serde::Serialize; +/// use serde_arrow::arrow::{ArraysBuilder}; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let fields = vec![ +/// Field::new("a", DataType::Float32, true), +/// Field::new("b", DataType::UInt64, false), +/// ]; +/// let mut builder = ArraysBuilder::new(&fields).unwrap(); +/// +/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); +/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); +/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); +/// +/// builder.extend(&[ +/// Record { a: Some(6.0), b: 7}, +/// Record { a: Some(8.0), b: 9}, +/// Record { a: Some(10.0), b: 11}, +/// ]).unwrap(); +/// +/// let arrays = builder.build_arrays().unwrap(); +/// +/// assert_eq!(arrays.len(), 2); +/// assert_eq!(arrays[0].len(), 6); +/// ``` +pub struct ArraysBuilder(generic::GenericBuilder); + +impl std::fmt::Debug for ArraysBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ArraysBuilder<...>") + } +} + +impl ArraysBuilder { + /// Build a new ArraysBuilder for the given fields + /// + /// This method may fail when unsupported data types are encountered in the + /// given fields. 
+ /// + pub fn new(fields: &[Field]) -> Result { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) + } + + /// Add a single record to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } + + /// Add multiple records to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the arrays from the rows pushed so far. + /// + /// This operation will reset the underlying buffers and start a new batch. + /// + pub fn build_arrays(&mut self) -> Result> { + self.0 .0.build_arrow_arrays() + } +} diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index 1a0dc38b..a6990bd2 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -1,412 +1,17 @@ -//! Support for the `arrow` crate (requires one the `arrow-*` features) +//! Support for the `arrow` crate (*requires one of the `arrow-*` features*) //! //! Functions to convert Rust objects into arrow Arrays. Deserialization from //! `arrow` arrays to Rust objects is not yet supported. //! 
#![deny(missing_docs)] +pub(crate) mod api; mod deserialization; mod schema; pub(crate) mod serialization; mod type_support; -use serde::{Deserialize, Serialize}; - -use crate::{ - _impl::arrow::{ - array::{Array, ArrayRef}, - datatypes::Field, - }, - internal::{ - error::Result, - generic, - schema::GenericField, - serialization::{compile_serialization, CompilationOptions, Interpreter}, - sink::serialize_into_sink, - source::deserialize_from_source, - tracing::{Tracer, TracingOptions}, - }, +#[allow(deprecated)] +pub use api::{ + deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, + serialize_into_field, serialize_into_fields, ArrayBuilder, ArraysBuilder, }; - -/// Determine the schema (as a list of fields) for the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). -/// -/// To correctly record the type information make sure to: -/// -/// - include values for `Option` -/// - include all variants of an enum -/// - include at least single element of a list or a map -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// # -/// use arrow::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::arrow::serialize_into_fields; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... 
-/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let expected = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// -/// assert_eq!(fields, expected); -/// ``` -/// -pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> -where - T: Serialize + ?Sized, -{ - let mut tracer = Tracer::new(String::from("$"), options); - tracer.trace_samples(items)?; - - let schema = tracer.to_schema()?; - schema.to_arrow_fields() -} - -/// Determine the schema of an object that represents a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// use arrow::datatypes::{DataType, Field}; -/// use serde_arrow::arrow::serialize_into_field; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); -/// assert_eq!(field, Field::new("floats", DataType::Float32, false)); -/// ``` -/// -pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result -where - T: Serialize + ?Sized, -{ - let mut tracer = Tracer::new(String::from("$"), options); - tracer.trace_samples(items)?; - - let field = tracer.to_field(name)?; - Field::try_from(&field) -} - -/// Build arrays from the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). -/// -/// Example: -/// -/// ```rust -/// use serde::Serialize; -/// use serde_arrow::arrow::{serialize_into_fields, serialize_into_arrays}; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... 
-/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// ``` -/// -pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result> -where - T: Serialize + ?Sized, -{ - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - - let program = compile_serialization(&fields, CompilationOptions::default())?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut interpreter, items)?; - interpreter.build_arrow_arrays() -} - -/// Deserialize a type from the given arrays -/// -/// The type should be a list of records (e.g., a vector of structs). -/// -/// ```rust -/// use serde::{Deserialize, Serialize}; -/// use serde_arrow::{ -/// arrow::{ -/// deserialize_from_arrays, -/// serialize_into_arrays, -/// serialize_into_fields, -/// }, -/// schema::TracingOptions, -/// }; -/// -/// ##[derive(Deserialize, Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// // provide an example record to get the field information -/// let fields = serialize_into_fields( -/// &[Record { a: Some(1.0), b: 2}], -/// TracingOptions::default(), -/// ).unwrap(); -/// # let items = &[Record { a: Some(1.0), b: 2}]; -/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// # -/// -/// // deserialize the records from arrays -/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); -/// ``` -/// -pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result -where - T: Deserialize<'de>, - A: AsRef, -{ - use crate::internal::{ - common::{BufferExtract, Buffers}, - deserialization, - }; - - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - - let num_items = arrays - .iter() - .map(|a| a.as_ref().len()) - .min() - .unwrap_or_default(); - - let mut buffers = Buffers::new(); - let mut mappings = 
Vec::new(); - for (field, array) in fields.iter().zip(arrays.iter()) { - mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); - } - - let interpreter = deserialization::compile_deserialization( - num_items, - &mappings, - buffers, - deserialization::CompilationOptions::default(), - )?; - deserialize_from_source(interpreter) -} - -/// Serialize an object that represents a single array into an array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// # -/// use arrow::datatypes::{DataType, Field}; -/// use serde_arrow::arrow::serialize_into_array; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// let array = serialize_into_array(&field, &items).unwrap(); -/// -/// assert_eq!(array.len(), 4); -/// ``` -pub fn serialize_into_array(field: &Field, items: &T) -> Result -where - T: Serialize + ?Sized, -{ - let field: GenericField = field.try_into()?; - - let program = compile_serialization( - std::slice::from_ref(&field), - CompilationOptions::default().wrap_with_struct(false), - )?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut interpreter, items)?; - interpreter.build_arrow_array() -} - -/// Deserialize a sequence of objects from a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// # -/// use arrow::{array::Array, datatypes::{DataType, Field}}; -/// use serde_arrow::arrow::{ -/// serialize_into_array, -/// deserialize_from_array, -/// }; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// -/// let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); -/// let items: Vec = deserialize_from_array(&field, &array).unwrap(); -/// ``` -/// -pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result -where - T: Deserialize<'de>, - A: AsRef + 'de + ?Sized, -{ - generic::deserialize_from_array(field, 
array.as_ref()) -} - -/// Build a single array item by item -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// use arrow::datatypes::{Field, DataType}; -/// use serde_arrow::arrow::ArrayBuilder; -/// -/// let field = Field::new("value", DataType::Int64, false); -/// let mut builder = ArrayBuilder::new(&field).unwrap(); -/// -/// builder.push(&-1_i64).unwrap(); -/// builder.push(&2_i64).unwrap(); -/// builder.push(&-3_i64).unwrap(); -/// -/// builder.extend(&[4_i64, -5, 6]).unwrap(); -/// -/// let array = builder.build_array().unwrap(); -/// assert_eq!(array.len(), 6); -/// ``` -pub struct ArrayBuilder(generic::GenericBuilder); - -impl ArrayBuilder { - /// Construct a new build for the given field - /// - /// This method may fail for an unsupported data type of the given field. - /// - pub fn new(field: &Field) -> Result { - Ok(Self(generic::GenericBuilder::new_for_array( - GenericField::try_from(field)?, - )?)) - } - - /// Add a single item to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple items to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the array from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. 
- /// - pub fn build_array(&mut self) -> Result { - self.0 .0.build_arrow_array() - } -} - -/// Build arrays record by record -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// use arrow::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::arrow::{ArraysBuilder}; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } - -/// let fields = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// let mut builder = ArraysBuilder::new(&fields).unwrap(); -/// -/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); -/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); -/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); -/// -/// builder.extend(&[ -/// Record { a: Some(6.0), b: 7}, -/// Record { a: Some(8.0), b: 9}, -/// Record { a: Some(10.0), b: 11}, -/// ]).unwrap(); -/// -/// let arrays = builder.build_arrays().unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// assert_eq!(arrays[0].len(), 6); -/// ``` -pub struct ArraysBuilder(generic::GenericBuilder); - -impl std::fmt::Debug for ArraysBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "ArraysBuilder<...>") - } -} - -impl ArraysBuilder { - /// Build a new ArraysBuilder for the given fields - /// - /// This method may fail when unsupported data types are encountered in the - /// given fields. - /// - pub fn new(fields: &[Field]) -> Result { - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) - } - - /// Add a single record to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple records to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the arrays from the rows pushed to far. 
- /// - /// This operation will reset the underlying buffers and start a new batch. - /// - pub fn build_arrays(&mut self) -> Result> { - self.0 .0.build_arrow_arrays() - } -} diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 63b7401b..46f179ac 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -3,13 +3,15 @@ use crate::{ _impl::arrow::datatypes::{DataType, Field, TimeUnit, UnionMode}, internal::{ error::{error, fail, Error, Result}, - generic, - schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, + schema::{ + GenericDataType, GenericField, GenericTimeUnit, SerdeArrowSchema, Strategy, + STRATEGY_KEY, + }, }, }; /// Support for arrow types (requires one of the `arrow-*` features) -impl Schema { +impl SerdeArrowSchema { /// Build a new Schema object from fields pub fn from_arrow_fields(fields: &[Field]) -> Result { Ok(Self { @@ -30,12 +32,13 @@ impl Schema { pub fn to_arrow_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } +} + +impl TryFrom for Vec { + type Error = Error; - /// If this schema object has a single valid field, return it - pub fn to_arrow_field(&self) -> Result { - let fields = self.to_arrow_fields()?; - generic::to_single_item(fields) - .ok_or_else(|| error!("schema does not have exactly one field")) + fn try_from(value: SerdeArrowSchema) -> Result { + value.to_arrow_fields() } } diff --git a/serde_arrow/src/arrow2/api.rs b/serde_arrow/src/arrow2/api.rs new file mode 100644 index 00000000..ad469e43 --- /dev/null +++ b/serde_arrow/src/arrow2/api.rs @@ -0,0 +1,522 @@ +//! Support for the `arrow2` crate (requires one the `arrow2-*` features) +//! +//! Functions to convert Rust objects into Arrow arrays and back. +//! 
+use serde::{Deserialize, Serialize}; + +use crate::{ + _impl::arrow2::{array::Array, datatypes::Field}, + internal::{ + error::Result, + generic, + schema::GenericField, + serialization::{compile_serialization, CompilationOptions, Interpreter}, + sink::serialize_into_sink, + source::deserialize_from_source, + tracing::{Tracer, TracingOptions}, + }, +}; + +/// Determine the schema (as a list of fields) for the given items +/// +/// `items` should be given in the form a list of records (e.g., a vector of +/// structs). +/// +/// To correctly record the type information make sure to: +/// +/// - include values for `Option` +/// - include all variants of an enum +/// - include at least single element of a list or a map +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow2 as arrow2; +/// # +/// use arrow2::datatypes::{DataType, Field}; +/// use serde::Serialize; +/// use serde_arrow::arrow2::serialize_into_fields; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let items = vec![ +/// Record { a: Some(1.0), b: 2}, +/// // ... +/// ]; +/// +/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); +/// let expected = vec![ +/// Field::new("a", DataType::Float32, true), +/// Field::new("b", DataType::UInt64, false), +/// ]; +/// +/// assert_eq!(fields, expected); +/// ``` +/// +pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> +where + T: Serialize + ?Sized, +{ + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + + let schema = tracer.to_schema()?; + schema.to_arrow2_fields() +} + +/// Build arrays from the given items +/// +/// `items` should be given in the form a list of records (e.g., a vector of +/// structs). +/// +/// To build arrays record by record use [ArraysBuilder]. 
+/// +/// ```rust +/// use serde::Serialize; +/// use serde_arrow::arrow2::{serialize_into_fields, serialize_into_arrays}; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let items = vec![ +/// Record { a: Some(1.0), b: 2}, +/// // ... +/// ]; +/// +/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); +/// let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// +/// assert_eq!(arrays.len(), 2); +/// ``` +/// +pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result>> +where + T: Serialize + ?Sized, +{ + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let program = compile_serialization(&fields, CompilationOptions::default())?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + + interpreter.build_arrow2_arrays() +} + +/// Build arrow2 arrays from the given items (*requires one of the `arrow2-*` +/// features*) +/// +/// `items` should be given in the form a list of records (e.g., a vector of +/// structs). +/// +/// To build arrays record by record use [ArraysBuilder]. +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde::{Serialize, Deserialize}; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// +/// ##[derive(Serialize, Deserialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let items = vec![ +/// Record { a: Some(1.0), b: 2}, +/// // ... 
+/// ]; +/// +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap().to_arrow2_fields().unwrap(); +/// let arrays = serde_arrow::to_arrow2(&fields, &items).unwrap(); +/// +/// assert_eq!(arrays.len(), 2); +/// # Ok(()) +/// # } +/// ``` +/// +pub fn to_arrow2(fields: &[Field], items: &T) -> Result>> +where + T: Serialize + ?Sized, +{ + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let program = compile_serialization(&fields, CompilationOptions::default())?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + + interpreter.build_arrow2_arrays() +} + +/// Deserialize a type from the given arrays +/// +/// The type should be a list of records (e.g., a vector of structs). +/// +/// ```rust +/// use serde::{Deserialize, Serialize}; +/// use serde_arrow::{ +/// arrow2::{ +/// deserialize_from_arrays, +/// serialize_into_arrays, +/// serialize_into_fields, +/// }, +/// schema::TracingOptions, +/// }; +/// +/// ##[derive(Deserialize, Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// // provide an example record to get the field information +/// let fields = serialize_into_fields( +/// &[Record { a: Some(1.0), b: 2}], +/// TracingOptions::default(), +/// ).unwrap(); +/// # let items = &[Record { a: Some(1.0), b: 2}]; +/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// # +/// +/// // deserialize the records from arrays +/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); +/// ``` +/// +pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +where + T: Deserialize<'de>, + A: AsRef, +{ + use crate::internal::{ + common::{BufferExtract, Buffers}, + deserialization, + }; + + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let num_items = arrays + .iter() + .map(|a| a.as_ref().len()) + .min() + .unwrap_or_default(); + + let 
mut buffers = Buffers::new(); + let mut mappings = Vec::new(); + for (field, array) in fields.iter().zip(arrays.iter()) { + mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); + } + + let interpreter = deserialization::compile_deserialization( + num_items, + &mappings, + buffers, + deserialization::CompilationOptions::default(), + )?; + deserialize_from_source(interpreter) +} + +/// Deserialize items from the given arrow2 arrays (*requires* one of the +/// `arrow2-*` features) +/// +/// The type should be a list of records (e.g., a vector of structs). +/// +/// ```rust +/// use serde::{Deserialize, Serialize}; +/// use serde_arrow::{ +/// arrow2::{ +/// deserialize_from_arrays, +/// serialize_into_arrays, +/// serialize_into_fields, +/// }, +/// schema::TracingOptions, +/// }; +/// +/// ##[derive(Deserialize, Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// // provide an example record to get the field information +/// let fields = serialize_into_fields( +/// &[Record { a: Some(1.0), b: 2}], +/// TracingOptions::default(), +/// ).unwrap(); +/// # let items = &[Record { a: Some(1.0), b: 2}]; +/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// # +/// +/// // deserialize the records from arrays +/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); +/// ``` +/// +pub fn from_arrow2<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +where + T: Deserialize<'de>, + A: AsRef, +{ + use crate::internal::{ + common::{BufferExtract, Buffers}, + deserialization, + }; + + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let num_items = arrays + .iter() + .map(|a| a.as_ref().len()) + .min() + .unwrap_or_default(); + + let mut buffers = Buffers::new(); + let mut mappings = Vec::new(); + for (field, array) in fields.iter().zip(arrays.iter()) { + mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); + } + + let interpreter = 
deserialization::compile_deserialization( + num_items, + &mappings, + buffers, + deserialization::CompilationOptions::default(), + )?; + deserialize_from_source(interpreter) +} + +/// Determine the schema of an object that represents a single array +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow2 as arrow2; +/// use arrow2::datatypes::{DataType, Field}; +/// use serde_arrow::arrow2::serialize_into_field; +/// +/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; +/// +/// let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); +/// assert_eq!(field, Field::new("floats", DataType::Float32, false)); +/// ``` +/// +pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result +where + T: Serialize + ?Sized, +{ + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; + let field = tracer.to_field(name)?; + Field::try_from(&field) +} + +/// Serialize a sequence of objects representing a single array into an array +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow2 as arrow2; +/// # +/// use arrow2::datatypes::{DataType, Field}; +/// use serde_arrow::arrow2::serialize_into_array; +/// +/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; +/// +/// let field = Field::new("floats", DataType::Float32, false); +/// let array = serialize_into_array(&field, &items).unwrap(); +/// +/// assert_eq!(array.len(), 4); +/// ``` +/// +pub fn serialize_into_array(field: &Field, items: &T) -> Result> +where + T: Serialize + ?Sized, +{ + let field: GenericField = field.try_into()?; + + let program = compile_serialization( + std::slice::from_ref(&field), + CompilationOptions::default().wrap_with_struct(false), + )?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + interpreter.build_arrow2_array() +} + +/// Deserialize a sequence of objects from a single array +/// +/// Example: +/// +/// ```rust +/// # use 
serde_arrow::_impl::arrow2 as arrow2; +/// # +/// use arrow2::{array::Array, datatypes::{DataType, Field}}; +/// use serde_arrow::arrow2::{ +/// serialize_into_array, +/// deserialize_from_array, +/// }; +/// +/// let field = Field::new("floats", DataType::Float32, false); +/// +/// let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); +/// let items: Vec = deserialize_from_array(&field, &array).unwrap(); +/// ``` +/// +pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result +where + T: Deserialize<'de>, + A: AsRef + 'de + ?Sized, +{ + generic::deserialize_from_array(field, array.as_ref()) +} + +/// Build a single array item by item +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow2 as arrow2; +/// use arrow2::datatypes::{Field, DataType}; +/// use serde_arrow::arrow2::ArrayBuilder; +/// +/// let field = Field::new("value", DataType::Int64, false); +/// let mut builder = ArrayBuilder::new(&field).unwrap(); +/// +/// builder.push(&-1_i64).unwrap(); +/// builder.push(&2_i64).unwrap(); +/// builder.push(&-3_i64).unwrap(); +/// +/// builder.extend(&[4_i64, -5, 6]).unwrap(); +/// +/// let array = builder.build_array().unwrap(); +/// assert_eq!(array.len(), 6); +/// ``` +pub struct ArrayBuilder(generic::GenericBuilder); + +impl ArrayBuilder { + /// Construct a new build for the given field + /// + /// This method may fail for an unsupported data type of the given field. + /// + pub fn new(field: &Field) -> Result { + Ok(Self(generic::GenericBuilder::new_for_array( + GenericField::try_from(field)?, + )?)) + } + + /// Add a single item to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } + + /// Add multiple items to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the array from the rows pushed to far. + /// + /// This operation will reset the underlying buffers and start a new batch. 
+ /// + pub fn build_array(&mut self) -> Result> { + self.0 .0.build_arrow2_array() + } +} + +/// Build arrays record by record +/// +/// Example: +/// +/// ```rust +/// # use serde_arrow::_impl::arrow2 as arrow2; +/// use arrow2::datatypes::{DataType, Field}; +/// use serde::Serialize; +/// use serde_arrow::arrow2::{ArraysBuilder}; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } + +/// let fields = vec![ +/// Field::new("a", DataType::Float32, true), +/// Field::new("b", DataType::UInt64, false), +/// ]; +/// let mut builder = ArraysBuilder::new(&fields).unwrap(); +/// +/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); +/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); +/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); +/// +/// builder.extend(&[ +/// Record { a: Some(6.0), b: 7}, +/// Record { a: Some(8.0), b: 9}, +/// Record { a: Some(10.0), b: 11}, +/// ]).unwrap(); +/// +/// let arrays = builder.build_arrays().unwrap(); +/// +/// assert_eq!(arrays.len(), 2); +/// assert_eq!(arrays[0].len(), 6); +/// ``` +pub struct ArraysBuilder(generic::GenericBuilder); + +impl std::fmt::Debug for ArraysBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ArraysBuilder<...>") + } +} + +impl ArraysBuilder { + /// Build a new ArraysBuilder for the given fields + /// + /// This method may fail when unsupported data types are encountered in the + /// given fields. + /// + pub fn new(fields: &[Field]) -> Result { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) + } + + /// Add a single record to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } + + /// Add multiple records to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the arrays from the rows pushed to far. 
+ /// + /// This operation will reset the underlying buffers and start a new batch. + /// + pub fn build_arrays(&mut self) -> Result>> { + self.0 .0.build_arrow2_arrays() + } +} diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 174ce44f..84411f00 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -1,8 +1,9 @@ -//! Support for the `arrow2` crate (requires one the `arrow2-*` features) +//! Support for the `arrow2` crate (*requires one the `arrow2-*` features*) //! //! Functions to convert Rust objects into Arrow arrays and back. //! #![deny(missing_docs)] +pub(crate) mod api; pub(crate) mod deserialization; pub(crate) mod schema; pub(crate) mod serialization; @@ -11,402 +12,8 @@ mod type_support; #[cfg(test)] mod test; -use serde::{Deserialize, Serialize}; - -use crate::{ - _impl::arrow2::{array::Array, datatypes::Field}, - internal::{ - error::Result, - generic, - schema::GenericField, - serialization::{compile_serialization, CompilationOptions, Interpreter}, - sink::serialize_into_sink, - source::deserialize_from_source, - tracing::{Tracer, TracingOptions}, - }, +#[allow(deprecated)] +pub use api::{ + deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, + serialize_into_field, serialize_into_fields, ArrayBuilder, ArraysBuilder, }; - -/// Determine the schema (as a list of fields) for the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). 
-/// -/// To correctly record the type information make sure to: -/// -/// - include values for `Option` -/// - include all variants of an enum -/// - include at least single element of a list or a map -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// # -/// use arrow2::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::arrow2::serialize_into_fields; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... -/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let expected = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// -/// assert_eq!(fields, expected); -/// ``` -/// -pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> -where - T: Serialize + ?Sized, -{ - let mut tracer = Tracer::new(String::from("$"), options); - tracer.trace_samples(items)?; - - let schema = tracer.to_schema()?; - schema.to_arrow2_fields() -} - -/// Build arrays from the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). -/// -/// To build arrays record by record use [ArraysBuilder]. -/// -/// ```rust -/// use serde::Serialize; -/// use serde_arrow::arrow2::{serialize_into_fields, serialize_into_arrays}; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... 
-/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// ``` -/// -pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result>> -where - T: Serialize + ?Sized, -{ - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - - let program = compile_serialization(&fields, CompilationOptions::default())?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut interpreter, items)?; - - interpreter.build_arrow2_arrays() -} - -/// Deserialize a type from the given arrays -/// -/// The type should be a list of records (e.g., a vector of structs). -/// -/// ```rust -/// use serde::{Deserialize, Serialize}; -/// use serde_arrow::{ -/// arrow2::{ -/// deserialize_from_arrays, -/// serialize_into_arrays, -/// serialize_into_fields, -/// }, -/// schema::TracingOptions, -/// }; -/// -/// ##[derive(Deserialize, Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// // provide an example record to get the field information -/// let fields = serialize_into_fields( -/// &[Record { a: Some(1.0), b: 2}], -/// TracingOptions::default(), -/// ).unwrap(); -/// # let items = &[Record { a: Some(1.0), b: 2}]; -/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// # -/// -/// // deserialize the records from arrays -/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); -/// ``` -/// -pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result -where - T: Deserialize<'de>, - A: AsRef, -{ - use crate::internal::{ - common::{BufferExtract, Buffers}, - deserialization, - }; - - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - - let num_items = arrays - .iter() - .map(|a| a.as_ref().len()) - .min() - .unwrap_or_default(); - - let mut buffers = Buffers::new(); - let mut 
mappings = Vec::new(); - for (field, array) in fields.iter().zip(arrays.iter()) { - mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); - } - - let interpreter = deserialization::compile_deserialization( - num_items, - &mappings, - buffers, - deserialization::CompilationOptions::default(), - )?; - deserialize_from_source(interpreter) -} - -/// Determine the schema of an object that represents a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{DataType, Field}; -/// use serde_arrow::arrow2::serialize_into_field; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); -/// assert_eq!(field, Field::new("floats", DataType::Float32, false)); -/// ``` -/// -pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result -where - T: Serialize + ?Sized, -{ - let mut tracer = Tracer::new(String::from("$"), options); - tracer.trace_samples(items)?; - let field = tracer.to_field(name)?; - Field::try_from(&field) -} - -/// Serialize a sequence of objects representing a single array into an array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// # -/// use arrow2::datatypes::{DataType, Field}; -/// use serde_arrow::arrow2::serialize_into_array; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// let array = serialize_into_array(&field, &items).unwrap(); -/// -/// assert_eq!(array.len(), 4); -/// ``` -/// -pub fn serialize_into_array(field: &Field, items: &T) -> Result> -where - T: Serialize + ?Sized, -{ - let field: GenericField = field.try_into()?; - - let program = compile_serialization( - std::slice::from_ref(&field), - CompilationOptions::default().wrap_with_struct(false), - )?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut 
interpreter, items)?; - interpreter.build_arrow2_array() -} - -/// Deserialize a sequence of objects from a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// # -/// use arrow2::{array::Array, datatypes::{DataType, Field}}; -/// use serde_arrow::arrow2::{ -/// serialize_into_array, -/// deserialize_from_array, -/// }; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// -/// let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); -/// let items: Vec = deserialize_from_array(&field, &array).unwrap(); -/// ``` -/// -pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result -where - T: Deserialize<'de>, - A: AsRef + 'de + ?Sized, -{ - generic::deserialize_from_array(field, array.as_ref()) -} - -/// Build a single array item by item -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{Field, DataType}; -/// use serde_arrow::arrow2::ArrayBuilder; -/// -/// let field = Field::new("value", DataType::Int64, false); -/// let mut builder = ArrayBuilder::new(&field).unwrap(); -/// -/// builder.push(&-1_i64).unwrap(); -/// builder.push(&2_i64).unwrap(); -/// builder.push(&-3_i64).unwrap(); -/// -/// builder.extend(&[4_i64, -5, 6]).unwrap(); -/// -/// let array = builder.build_array().unwrap(); -/// assert_eq!(array.len(), 6); -/// ``` -pub struct ArrayBuilder(generic::GenericBuilder); - -impl ArrayBuilder { - /// Construct a new build for the given field - /// - /// This method may fail for an unsupported data type of the given field. 
- /// - pub fn new(field: &Field) -> Result { - Ok(Self(generic::GenericBuilder::new_for_array( - GenericField::try_from(field)?, - )?)) - } - - /// Add a single item to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple items to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the array from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. - /// - pub fn build_array(&mut self) -> Result> { - self.0 .0.build_arrow2_array() - } -} - -/// Build arrays record by record -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::arrow2::{ArraysBuilder}; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } - -/// let fields = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// let mut builder = ArraysBuilder::new(&fields).unwrap(); -/// -/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); -/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); -/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); -/// -/// builder.extend(&[ -/// Record { a: Some(6.0), b: 7}, -/// Record { a: Some(8.0), b: 9}, -/// Record { a: Some(10.0), b: 11}, -/// ]).unwrap(); -/// -/// let arrays = builder.build_arrays().unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// assert_eq!(arrays[0].len(), 6); -/// ``` -pub struct ArraysBuilder(generic::GenericBuilder); - -impl std::fmt::Debug for ArraysBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "ArraysBuilder<...>") - } -} - -impl ArraysBuilder { - /// Build a new ArraysBuilder for the given fields - /// - /// This method may fail when unsupported data types are encountered in the - /// given 
fields. - /// - pub fn new(fields: &[Field]) -> Result { - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) - } - - /// Add a single record to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple records to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the arrays from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. - /// - pub fn build_arrays(&mut self) -> Result>> { - self.0 .0.build_arrow2_arrays() - } -} diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 2f009df7..7a6f70d6 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -2,13 +2,15 @@ use crate::{ _impl::arrow2::datatypes::{DataType, Field, IntegerType, TimeUnit, UnionMode}, internal::{ error::{error, fail, Error, Result}, - generic, - schema::{GenericDataType, GenericField, GenericTimeUnit, Schema, Strategy, STRATEGY_KEY}, + schema::{ + GenericDataType, GenericField, GenericTimeUnit, SerdeArrowSchema, Strategy, + STRATEGY_KEY, + }, }, }; /// Support for arrow2 types (requires one of the `arrow2-*` features) -impl Schema { +impl SerdeArrowSchema { /// Build a new Schema object from fields pub fn from_arrow2_fields(fields: &[Field]) -> Result { Ok(Self { @@ -29,12 +31,13 @@ impl Schema { pub fn to_arrow2_fields(&self) -> Result> { self.fields.iter().map(Field::try_from).collect() } +} + +impl TryFrom for Vec { + type Error = Error; - /// If this schema object has a single valid field, return it - pub fn to_arrow2_field(&self) -> Result { - let fields = self.to_arrow2_fields()?; - generic::to_single_item(fields) - .ok_or_else(|| error!("schema does not have exactly one field")) + fn try_from(value: SerdeArrowSchema) -> Result { + value.to_arrow2_fields() } } diff 
--git a/serde_arrow/src/internal/config.rs b/serde_arrow/src/internal/config.rs index 33b9f304..a3881d7c 100644 --- a/serde_arrow/src/internal/config.rs +++ b/serde_arrow/src/internal/config.rs @@ -21,11 +21,12 @@ pub struct Configuration { /// /// Usage: /// -/// ``` +/// ```ignore /// serde_arrow::experimental::configure(|c| { /// // set attributes on c /// }); /// ``` +#[allow(unused)] pub fn configure(f: F) { let mut guard = CONFIGURATION.write().unwrap(); f(&mut guard) diff --git a/serde_arrow/src/internal/error.rs b/serde_arrow/src/internal/error.rs index 1563eef6..69b082c2 100644 --- a/serde_arrow/src/internal/error.rs +++ b/serde_arrow/src/internal/error.rs @@ -159,3 +159,14 @@ impl From for Error { Self::custom(format!("bytemuck::PodCastError: {err}")) } } + +/// An error type for testing, that panics once an error is converted +#[allow(unused)] +#[derive(Debug)] +pub struct PanicOnError; + +impl From for PanicOnError { + fn from(value: E) -> Self { + panic!("{value}"); + } +} diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index 1688fc97..e1e5e721 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -1,4 +1,4 @@ -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Serialize, ser::SerializeSeq}; use crate::internal::{ common::{BufferExtract, Buffers}, @@ -10,15 +10,6 @@ use crate::internal::{ source::deserialize_from_source, }; -/// Return the item if the vector has exactly one -pub fn to_single_item(items: Vec) -> Option { - if items.len() == 1 { - items.into_iter().next() - } else { - None - } -} - pub struct GenericBuilder(pub serialization::Interpreter); impl GenericBuilder { @@ -76,3 +67,78 @@ where )?; deserialize_from_source(interpreter) } + +/// A wrapper around a sequence of individual items +pub struct Items( + /// The wrapped object + pub T, +); + +/// A wrapper around a single item +pub struct Item( + /// The wrapped object + pub T, +); + +impl Serialize 
for Item { + fn serialize(&self, serializer: S) -> std::result::Result { + #[derive(Debug, Serialize)] + struct Item<'a, T> { + item: &'a T, + } + Item { item: &self.0 }.serialize(serializer) + } +} + +impl<'de, T: Deserialize<'de>> Deserialize<'de> for Item { + fn deserialize>(deserializer: D) -> std::result::Result { + #[derive(Debug, Deserialize)] + struct Item { + item: T, + } + let item = Item::::deserialize(deserializer)?; + Ok(Item(item.item)) + } +} + +// TODO: implement for all types? +impl<'de, T: Deserialize<'de>> Deserialize<'de> for Items> { + fn deserialize>(deserializer: D) -> std::result::Result { + let items = Vec::>::deserialize(deserializer)?.into_iter().map(|item| item.0).collect(); + Ok(Items(items)) + } +} + +impl Serialize for Items> { + fn serialize(&self, serializer: S) -> std::result::Result { + Items(self.0.as_slice()).serialize(serializer) + } +} + +impl<'a, T: Serialize> Serialize for Items<&'a Vec> { + fn serialize(&self, serializer: S) -> std::result::Result { + Items(self.0.as_slice()).serialize(serializer) + } +} + +impl Serialize for Items<[T; N]> { + fn serialize(&self, serializer: S) -> std::result::Result { + Items(self.0.as_slice()).serialize(serializer) + } +} + +impl<'a, const N: usize, T: Serialize> Serialize for Items<&'a [T; N]> { + fn serialize(&self, serializer: S) -> std::result::Result { + Items(self.0.as_slice()).serialize(serializer) + } +} + +impl<'a, T: Serialize> Serialize for Items<&'a [T]> { + fn serialize(&self, serializer: S) -> std::result::Result { + let mut seq = serializer.serialize_seq(Some(self.0.len()))?; + for item in self.0 { + seq.serialize_element(&Item(item))?; + } + seq.end() + } +} diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index a39d6086..9a79e952 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -16,10 +16,12 @@ use serde::{Deserialize, Serialize}; /// pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; -/// 
A collection of fields that can be easily serialized and deserialized +/// A collection of fields as understood by `serde_arrow` +/// +/// `SerdeArrowSchema` is designed to be easily serialized and deserialized /// /// ```rust -/// # use serde_arrow::schema::Schema; +/// # use serde_arrow::schema::SerdeArrowSchema; /// let schema_json = r#" /// [ /// { @@ -32,11 +34,10 @@ pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; /// ] /// "#; /// -/// let schema: Schema = serde_json::from_str(&schema_json).unwrap(); +/// let schema: SerdeArrowSchema = serde_json::from_str(&schema_json).unwrap(); /// ``` /// -/// The serialization format is designed to be as easy as possible to to write -/// by hand. The schema can be given in two ways: +/// The schema can be given in two ways: /// /// - an array of fields /// - or an object with a `"fields"` key that contains an array of fields @@ -69,7 +70,7 @@ pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; /// #[derive(Default, Debug, PartialEq, Clone, Serialize, Deserialize)] #[serde(from = "SchemaSerializationOptions")] -pub struct Schema { +pub struct SerdeArrowSchema { pub(crate) fields: Vec, } @@ -80,7 +81,7 @@ enum SchemaSerializationOptions { FullSchema { fields: Vec }, } -impl From for Schema { +impl From for SerdeArrowSchema { fn from(value: SchemaSerializationOptions) -> Self { use SchemaSerializationOptions::*; match value { @@ -89,17 +90,13 @@ impl From for Schema { } } -impl Schema { +impl SerdeArrowSchema { /// Return a new schema (empty) instance pub fn new() -> Self { Self::default() } /// Determine the schema from the given type - /// - /// For more control consider using the underlying - /// [SchemaTracer][crate::schema::SchemaTracer] directly. 
- /// pub fn from_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_type::()?; @@ -108,10 +105,13 @@ impl Schema { /// Determine the schema from the given samples /// - /// For more control consider using the underlying - /// [SchemaTracer][crate::schema::SchemaTracer] directly. + /// To correctly record the type information make sure to: + /// + /// - include values for `Option` + /// - include all variants of an enum + /// - include at least single element of a list or a map /// - pub fn from_samples(options: TracingOptions, samples: &T) -> Result { + pub fn from_samples(samples: &T, options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_samples(samples)?; tracer.to_schema() @@ -765,9 +765,9 @@ fn field_is_compatible(left: &GenericField, right: &GenericField) -> bool { mod test_schema_serialization { use crate::internal::schema::GenericDataType; - use super::{GenericField, Schema}; + use super::{GenericField, SerdeArrowSchema}; - impl Schema { + impl SerdeArrowSchema { fn with_field(mut self, field: GenericField) -> Self { self.fields.push(field); self @@ -776,7 +776,7 @@ mod test_schema_serialization { #[test] fn example() { - let schema = Schema::new() + let schema = SerdeArrowSchema::new() .with_field(GenericField::new("foo", GenericDataType::U8, false)) .with_field(GenericField::new("bar", GenericDataType::Utf8, false)); @@ -786,25 +786,25 @@ mod test_schema_serialization { r#"{"fields":[{"name":"foo","data_type":"U8"},{"name":"bar","data_type":"Utf8"}]}"# ); - let round_tripped: Schema = serde_json::from_str(&actual).unwrap(); + let round_tripped: SerdeArrowSchema = serde_json::from_str(&actual).unwrap(); assert_eq!(round_tripped, schema); } #[test] fn example_without_wrapper() { - let expected = Schema::new() + let expected = SerdeArrowSchema::new() .with_field(GenericField::new("foo", GenericDataType::U8, false)) 
.with_field(GenericField::new("bar", GenericDataType::Utf8, false)); let input = r#"[{"name":"foo","data_type":"U8"},{"name":"bar","data_type":"Utf8"}]"#; - let actual: Schema = serde_json::from_str(&input).unwrap(); + let actual: SerdeArrowSchema = serde_json::from_str(&input).unwrap(); assert_eq!(actual, expected); } #[test] fn list() { let schema = - Schema::new().with_field( + SerdeArrowSchema::new().with_field( GenericField::new("value", GenericDataType::List, false) .with_child(GenericField::new("element", GenericDataType::I32, false)), ); @@ -815,7 +815,7 @@ mod test_schema_serialization { r#"{"fields":[{"name":"value","data_type":"List","children":[{"name":"element","data_type":"I32"}]}]}"# ); - let round_tripped: Schema = serde_json::from_str(&actual).unwrap(); + let round_tripped: SerdeArrowSchema = serde_json::from_str(&actual).unwrap(); assert_eq!(round_tripped, schema); } @@ -828,8 +828,8 @@ mod test_schema_serialization { ] "#; - let actual: Schema = serde_json::from_str(&schema).unwrap(); - let expected = Schema::new() + let actual: SerdeArrowSchema = serde_json::from_str(&schema).unwrap(); + let expected = SerdeArrowSchema::new() .with_field(GenericField::new("foo", GenericDataType::U8, false)) .with_field(GenericField::new("bar", GenericDataType::Utf8, false)); diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index 3ed76dd7..26f3d048 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -69,13 +69,6 @@ pub struct TracingOptions { /// [`NaiveStrAsDate64`][crate::schema::Strategy::NaiveStrAsDate64] or /// [`UtcStrAsDate64`][crate::schema::Strategy::UtcStrAsDate64]. pub guess_dates: bool, - - /// If not `None`, trace the schema as a field with the given name instead - /// of multiple fields - /// - /// This may be helpful when the individual items are not structs, but other - /// objects, e.g., numbers or strings. 
- pub as_field: Option, } impl Default for TracingOptions { @@ -86,7 +79,6 @@ impl Default for TracingOptions { string_dictionary_encoding: false, coerce_numbers: false, guess_dates: false, - as_field: None, } } } @@ -125,10 +117,4 @@ impl TracingOptions { self.guess_dates = value; self } - - /// Set [`as_field`](#structfield.as_field) - pub fn as_field>(mut self, value: S) -> Self { - self.as_field = Some(value.into()); - self - } } diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index c6b2b2b8..2a9cb744 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use crate::internal::{ error::{fail, Result}, - schema::{GenericDataType, GenericField, Schema, Strategy}, + schema::{GenericDataType, GenericField, SerdeArrowSchema, Strategy}, tracing::TracingOptions, }; @@ -39,21 +39,20 @@ impl Tracer { } /// Convert the traced schema into a schema object - pub fn to_schema(&self) -> Result { - let fields = if let Some(field_name) = self.get_options().as_field.as_ref() { - let field = self.to_field(field_name)?; - vec![field] - } else { - let root = self.to_field("root")?; + pub fn to_schema(&self) -> Result { + let root = self.to_field("root")?; - match root.data_type { - GenericDataType::Struct => root.children, - GenericDataType::Null => fail!("No records found to determine schema"), - dt => fail!("Unexpected root data type {dt:?}"), - } + if root.nullable { + fail!("The root type cannot be nullable"); + } + + let fields = match root.data_type { + GenericDataType::Struct => root.children, + GenericDataType::Null => fail!("No records found to determine schema"), + dt => fail!("Unexpected root data type {dt:?}"), }; - Ok(Schema { fields }) + Ok(SerdeArrowSchema { fields }) } } diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 54b3dd12..e8847fd7 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs 
@@ -124,6 +124,12 @@ //! mod internal; +#[cfg(has_arrow)] +pub use arrow::api::{from_arrow, to_arrow}; + +#[cfg(has_arrow2)] +pub use arrow2::api::{from_arrow2, to_arrow2}; + /// Internal. Do not use /// /// This module is an internal implementation detail and not subject to any @@ -131,6 +137,7 @@ mod internal; /// to allow usage in doc tests or benchmarks. /// #[rustfmt::skip] +#[doc(hidden)] pub mod _impl { #[allow(unused)] macro_rules! build_arrow2_crate { @@ -149,6 +156,13 @@ pub mod _impl { /// A "fake" arrow crate re-exporting the relevant definitions of the /// used arrow-* subcrates pub mod arrow { + /// The raw arrow packages + pub mod _raw { + pub use $arrow_array as array; + pub use $arrow_buffer as buffer; + pub use $arrow_data as data; + pub use $arrow_schema as schema; + } pub mod array { pub use $arrow_array::array::{ make_array, Array, ArrayRef, ArrowPrimitiveType, BooleanArray, @@ -220,85 +234,8 @@ mod test; pub use crate::internal::error::{Error, Result}; -/// Configure how Arrow and Rust types are translated into one another -/// -/// When tracing the schema using the `serialize_into_fields` methods, the -/// following defaults are used: -/// -/// - Strings: `LargeUtf8`, i.e., i64 offsets -/// - Lists: `LargeList`, i.e., i64 offsets -/// - Strings with dictionary encoding: U32 keys and LargeUtf8 values -/// - Rationale: `polars` cannot handle 64 bit keys in its default -/// configuration -/// -/// Null-only fields (e.g., fields of type `()` or fields with only `None` -/// entries) result in errors per default. -/// [`TracingOptions::allow_null_fields`][crate::internal::tracing::TracingOptions::allow_null_fields] -/// allows to disable this behavior. -/// -/// All customization of the types happens via the metadata of the fields -/// structs describing arrays. 
For example, to let `serde_arrow` handle date -/// time objects that are serialized to strings (chrono's default), use -/// -/// ```rust -/// # #[cfg(feature="arrow2")] -/// # fn main() { -/// # use arrow2::datatypes::{DataType, Field}; -/// # use serde_arrow::schema::{STRATEGY_KEY, Strategy}; -/// # let mut field = Field::new("my_field", DataType::Null, false); -/// field.data_type = DataType::Date64; -/// field.metadata = Strategy::UtcStrAsDate64.into(); -/// # } -/// # #[cfg(not(feature="arrow2"))] -/// # fn main() {} -/// ``` #[deny(missing_docs)] -pub mod schema { - pub use crate::internal::{ - schema::{Schema, Strategy, STRATEGY_KEY}, - tracing::TracingOptions, - }; +pub mod schema; - /// Trace the schema from type information and samples - pub struct SchemaTracer(crate::internal::tracing::Tracer); - - impl SchemaTracer { - /// Build a new schema tracer with the given options - pub fn new(options: TracingOptions) -> Self { - Self(crate::internal::tracing::Tracer::new( - String::from("$"), - options, - )) - } - - /// Trace type information of the given type - /// - /// Note: the given type should be the type of the element, not the - /// sequence of elements. - pub fn trace_type<'de, T: serde::Deserialize<'de>>(&mut self) -> crate::Result<()> { - self.0.trace_type::() - } - - /// Trace type information from the given samples - /// - /// Note: the given samples should be a sequence of elements. 
- pub fn trace_samples( - &mut self, - samples: &T, - ) -> crate::Result<()> { - self.0.trace_samples(samples) - } - - /// Convert the tracer to a schema - pub fn to_schema(&self) -> crate::Result { - self.0.to_schema() - } - } -} - -/// Experimental functionality that is not bound by semver compatibility -/// #[deny(missing_docs)] -pub mod experimental { - pub use crate::internal::config::{configure, Configuration}; -} +pub mod utils; \ No newline at end of file diff --git a/serde_arrow/src/schema.rs b/serde_arrow/src/schema.rs new file mode 100644 index 00000000..bc41452f --- /dev/null +++ b/serde_arrow/src/schema.rs @@ -0,0 +1,40 @@ +//! Configure how Arrow and Rust types are translated into one another +//! +//! When tracing the schema using the `serialize_into_fields` methods, the +//! following defaults are used: +//! +//! - Strings: `LargeUtf8`, i.e., i64 offsets +//! - Lists: `LargeList`, i.e., i64 offsets +//! - Strings with dictionary encoding: U32 keys and LargeUtf8 values +//! - Rationale: `polars` cannot handle 64 bit keys in its default +//! configuration +//! +//! Null-only fields (e.g., fields of type `()` or fields with only `None` +//! entries) result in errors per default. +//! [`TracingOptions::allow_null_fields`][crate::internal::tracing::TracingOptions::allow_null_fields] +//! allows to disable this behavior. +//! +//! All customization of the types happens via the metadata of the fields +//! structs describing arrays. For example, to let `serde_arrow` handle date +//! time objects that are serialized to strings (chrono's default), use +//! +//! ```rust +//! # #[cfg(feature="arrow2")] +//! # fn main() { +//! # use arrow2::datatypes::{DataType, Field}; +//! # use serde_arrow::schema::{STRATEGY_KEY, Strategy}; +//! # let mut field = Field::new("my_field", DataType::Null, false); +//! field.data_type = DataType::Date64; +//! field.metadata = Strategy::UtcStrAsDate64.into(); +//! # } +//! # #[cfg(not(feature="arrow2"))] +//! # fn main() {} +//! 
``` +pub use crate::internal::{ + schema::{SerdeArrowSchema, Strategy, STRATEGY_KEY}, + tracing::TracingOptions, +}; + +/// Type alias for SerdeArrowSchema for backwards compatibility +#[deprecated = "serde_arrow::schema::Schema is deprecated. Use serde_arrow::schema::SerdeArrowSchema instead"] +pub type Schema = SerdeArrowSchema; diff --git a/serde_arrow/src/test_end_to_end/issue_90.rs b/serde_arrow/src/test_end_to_end/issue_90.rs index 100603d9..dfea8dd0 100644 --- a/serde_arrow/src/test_end_to_end/issue_90.rs +++ b/serde_arrow/src/test_end_to_end/issue_90.rs @@ -1,27 +1,29 @@ +//! Test the example from https://github.com/chmp/serde_arrow/issues/90 +use std::sync::Arc; + use serde::{Deserialize, Serialize}; -use crate::{ - arrow::serialize_into_arrays, - schema::{Schema, TracingOptions}, -}; +use crate::{self as serde_arrow, internal::error::PanicOnError, schema::TracingOptions}; -#[test] -fn example() -> Result<(), PanicOnError> { - #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] - pub struct Distribution { - pub samples: Vec, - pub statistic: String, - } +use crate::_impl::arrow::{ + _raw::{array::RecordBatch, schema::Schema}, + datatypes::Field, +}; - #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] - pub struct VectorMetric { - pub distribution: Option, - } +#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] +pub struct Distribution { + pub samples: Vec, + pub statistic: String, +} - let schema = Schema::from_type::(TracingOptions::default())?; - let fields = schema.to_arrow_fields()?; +#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] +pub struct VectorMetric { + pub distribution: Option, +} - let metrics = [ +#[test] +fn example() -> Result<(), PanicOnError> { + let metrics = vec![ VectorMetric { distribution: Some(Distribution { samples: vec![1.0, 2.0, 3.0], @@ -34,17 +36,30 @@ fn example() -> Result<(), PanicOnError> { statistic: String::from("metric2"), }), }, + VectorMetric { distribution: None }, ]; - let _arrays = 
serialize_into_arrays(&fields, &metrics)?; + use serde_arrow::schema::SerdeArrowSchema; + + let fields: Vec = + SerdeArrowSchema::from_type::(TracingOptions::default())?.try_into()?; + let arrays = serde_arrow::to_arrow(&fields, &metrics)?; + + let batch = RecordBatch::try_new(Arc::new(Schema::new(fields.clone())), arrays.clone())?; + println!("{:#?}", batch); + + let round_tripped: Vec = serde_arrow::from_arrow(&fields, &arrays)?; + assert_eq!(metrics, round_tripped); + Ok(()) } -#[derive(Debug)] -struct PanicOnError; +#[test] +fn example_top_level_none() -> Result<(), PanicOnError> { + use serde_arrow::schema::SerdeArrowSchema; -impl From for PanicOnError { - fn from(value: E) -> Self { - panic!("{value}"); - } + // top-level options are not supported if fields are are extracted + let res = SerdeArrowSchema::from_type::>(TracingOptions::default()); + assert!(res.is_err()); + Ok(()) } diff --git a/serde_arrow/src/test_end_to_end/test_items.rs b/serde_arrow/src/test_end_to_end/test_items.rs new file mode 100644 index 00000000..3a27b6c1 --- /dev/null +++ b/serde_arrow/src/test_end_to_end/test_items.rs @@ -0,0 +1,28 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use crate::{self as serde_arrow, internal::{error::PanicOnError, generic::Items}, schema::TracingOptions}; + +#[test] +fn example() -> Result<(), PanicOnError> { + use serde_arrow::schema::SerdeArrowSchema; + + let items: Vec = vec![1, 2, 3, 4, 5]; + + let fields_from_type: Vec = SerdeArrowSchema::from_type::>>(TracingOptions::default())?.try_into()?; + let fields_from_samples: Vec = SerdeArrowSchema::from_samples(&Items(&items), TracingOptions::default())?.try_into()?; + + assert_eq!(fields_from_type, fields_from_samples); + let fields = fields_from_type; + + let arrays = serde_arrow::to_arrow(&fields, &Items(&items))?; + + let batch = RecordBatch::try_new(Arc::new(Schema::new(fields.clone())), arrays.clone())?; + println!("{:#?}", batch); + + let Items(round_tripped): Items> = 
serde_arrow::from_arrow(&fields, &arrays)?; + assert_eq!(metrics, round_tripped); + + Ok(()) +} diff --git a/serde_arrow/src/test_impls/chrono.rs b/serde_arrow/src/test_impls/chrono.rs index dbfac8e5..6aba0bc5 100644 --- a/serde_arrow/src/test_impls/chrono.rs +++ b/serde_arrow/src/test_impls/chrono.rs @@ -3,7 +3,7 @@ use super::macros::test_example; test_example!( test_name = utc_as_str, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = DateTime, values = [ Utc.with_ymd_and_hms(2020, 12, 24, 8, 30, 0).unwrap(), @@ -18,7 +18,7 @@ test_example!( test_example!( test_name = naive_as_str, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = NaiveDateTime, values = [ NaiveDateTime::from_timestamp_millis(1662921288000).unwrap(), @@ -33,8 +33,8 @@ test_example!( test_example!( test_name = utc_as_date64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), - overwrite_field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::LargeUtf8, false), + overwrite_field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::UtcStrAsDate64), ty = DateTime, values = [ @@ -50,8 +50,8 @@ test_example!( test_example!( test_name = naive_as_date64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), - overwrite_field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::LargeUtf8, false), + overwrite_field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::NaiveStrAsDate64), ty = NaiveDateTime, values = [ @@ -67,8 +67,8 @@ 
test_example!( test_example!( test_name = utc_as_date64_as_millis, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I64, false), - overwrite_field = GenericField::new("root", GenericDataType::Date64, false), + field = GenericField::new("item", GenericDataType::I64, false), + overwrite_field = GenericField::new("item", GenericDataType::Date64, false), ty = T, values = [ T(Utc.with_ymd_and_hms(2020, 12, 24, 8, 30, 0).unwrap()), @@ -86,8 +86,8 @@ test_example!( test_example!( test_name = utc_as_timestamp, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), - overwrite_field = GenericField::new("root", GenericDataType::Timestamp(GenericTimeUnit::Millisecond, Some("UTC".into())), false) + field = GenericField::new("item", GenericDataType::LargeUtf8, false), + overwrite_field = GenericField::new("item", GenericDataType::Timestamp(GenericTimeUnit::Millisecond, Some("UTC".into())), false) .with_strategy(Strategy::UtcStrAsDate64), ty = DateTime, values = [ @@ -103,9 +103,9 @@ test_example!( test_example!( test_name = naive_as_timestamp, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), overwrite_field = GenericField::new( - "root", + "item", GenericDataType::Timestamp(GenericTimeUnit::Millisecond, None), false ) @@ -125,7 +125,7 @@ test_example!( test_name = utc_as_date64_tracing, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::UtcStrAsDate64), ty = DateTime, values = [ @@ -142,7 +142,7 @@ test_example!( test_name = naive_as_date64_tracing, test_bytecode_deserialization = true, tracing_options = 
TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::NaiveStrAsDate64), ty = NaiveDateTime, values = [ @@ -159,7 +159,7 @@ test_example!( test_name = utc_as_date64_tracing_string_only, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::UtcStrAsDate64), ty = String, values = [ @@ -173,7 +173,7 @@ test_example!( test_name = utc_as_date64_tracing_string_nullable, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, true) + field = GenericField::new("item", GenericDataType::Date64, true) .with_strategy(Strategy::UtcStrAsDate64), ty = Option, values = [ @@ -188,7 +188,7 @@ test_example!( test_name = utc_as_date64_tracing_string_only_with_invalid, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = String, values = [ String::from("2015-09-18T23:56:04Z"), @@ -202,7 +202,7 @@ test_example!( test_name = naive_as_date64_tracing_string_only, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, false) + field = GenericField::new("item", GenericDataType::Date64, false) .with_strategy(Strategy::NaiveStrAsDate64), ty = String, values = [ @@ -216,7 +216,7 @@ test_example!( test_name = naive_as_date64_tracing_string_nullable, test_bytecode_deserialization = true, tracing_options = 
TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::Date64, true) + field = GenericField::new("item", GenericDataType::Date64, true) .with_strategy(Strategy::NaiveStrAsDate64), ty = Option, values = [ @@ -231,7 +231,7 @@ test_example!( test_name = naive_as_date64_tracing_string_only_with_invalid, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = String, values = [ String::from("2015-09-18T23:56:04"), @@ -245,7 +245,7 @@ test_example!( test_name = incompatible_date_formats, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().guess_dates(true), - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = String, values = [ String::from("2015-09-18T23:56:04Z"), diff --git a/serde_arrow/src/test_impls/dictionary.rs b/serde_arrow/src/test_impls/dictionary.rs index 966f15d5..8b1ba569 100644 --- a/serde_arrow/src/test_impls/dictionary.rs +++ b/serde_arrow/src/test_impls/dictionary.rs @@ -5,7 +5,7 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", @@ -22,7 +22,7 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) 
.with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -35,14 +35,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U8, false)) .with_child(GenericField::new( "value", @@ -59,10 +59,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U8, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -75,14 +75,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) 
.with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U16, false)) .with_child(GenericField::new( "value", @@ -99,10 +99,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U16, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -115,14 +115,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U64, false)) .with_child(GenericField::new( "value", @@ -139,10 +139,10 @@ test_example!( 
test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U64, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -155,14 +155,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I32, false)) .with_child(GenericField::new( "value", @@ -179,10 +179,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", 
GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -195,14 +195,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I8, false)) .with_child(GenericField::new( "value", @@ -219,10 +219,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I8, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -235,14 +235,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = 
GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I16, false)) .with_child(GenericField::new( "value", @@ -259,10 +259,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I16, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -275,14 +275,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) 
.with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new( "value", @@ -299,10 +299,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), ty = Option, @@ -315,14 +315,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -335,10 +335,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) 
.with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -351,14 +351,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U8, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -371,10 +371,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U8, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, 
false)), ty = Option, @@ -387,14 +387,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U16, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -407,10 +407,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U16, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -423,14 +423,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) 
.with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U64, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -443,10 +443,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U64, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -459,14 +459,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I32, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -479,10 +479,10 @@ test_example!( 
test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I32, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -495,14 +495,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I8, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -515,10 +515,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - 
overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I8, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -531,14 +531,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I16, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -551,10 +551,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I16, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, @@ -567,14 +567,14 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = 
TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, false) + field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new( "value", GenericDataType::LargeUtf8, false )), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, false) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, false) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = String, @@ -587,10 +587,10 @@ test_example!( test_bytecode_deserialization = true, test_deserialization = [], tracing_options = TracingOptions::default().string_dictionary_encoding(true), - field = GenericField::new("root", GenericDataType::Dictionary, true) + field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::U32, false)) .with_child(GenericField::new("value", GenericDataType::LargeUtf8, false)), - overwrite_field = GenericField::new("root", GenericDataType::Dictionary, true) + overwrite_field = GenericField::new("item", GenericDataType::Dictionary, true) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Utf8, false)), ty = Option, diff --git a/serde_arrow/src/test_impls/examples.rs b/serde_arrow/src/test_impls/examples.rs index fa49cfc0..2a9b9996 100644 --- a/serde_arrow/src/test_impls/examples.rs +++ b/serde_arrow/src/test_impls/examples.rs @@ -3,7 +3,7 @@ use super::macros::*; test_example!( test_name = benchmark_primitives, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U8, false)) 
.with_child(GenericField::new("b", GenericDataType::U16, false)) .with_child(GenericField::new("c", GenericDataType::U32, false)) @@ -39,7 +39,7 @@ test_example!( test_example!( test_name = benchmark_complex_1, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new( "string", GenericDataType::LargeUtf8, @@ -96,7 +96,7 @@ test_example!( test_example!( test_name = benchmark_complex_2, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new( "string", GenericDataType::LargeUtf8, @@ -163,7 +163,7 @@ test_example!( test_example!( test_name = nested_options, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U8, false)) .with_child(GenericField::new("b", GenericDataType::U16, true)) .with_child(GenericField::new("c", GenericDataType::U32, true)), @@ -219,7 +219,7 @@ test_example!( test_name = fieldless_unions_in_a_struct, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("foo", GenericDataType::U32, false)) .with_child( GenericField::new("bar", GenericDataType::Union, false) @@ -274,7 +274,7 @@ test_example!( test_name = issue_57, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) 
.with_child(GenericField::new( "filename", GenericDataType::LargeUtf8, @@ -383,7 +383,7 @@ test_roundtrip_arrays!( test_example!( test_name = new_type_wrappers, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U64, false), + field = GenericField::new("item", GenericDataType::U64, false), ty = U64, values = [U64(0), U64(1), U64(2)], nulls = [false, false, false], @@ -397,7 +397,7 @@ test_example!( test_name = unit, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Null, true), + field = GenericField::new("item", GenericDataType::Null, true), ty = (), values = [(), (), (), ()], ); diff --git a/serde_arrow/src/test_impls/issue_74_unknown_fields.rs b/serde_arrow/src/test_impls/issue_74_unknown_fields.rs index 0ca93977..cc16418c 100644 --- a/serde_arrow/src/test_impls/issue_74_unknown_fields.rs +++ b/serde_arrow/src/test_impls/issue_74_unknown_fields.rs @@ -11,7 +11,7 @@ macro_rules! 
test_missing_field { Field::try_from(&GenericField::new("a", GenericDataType::U8, false)).unwrap(), ]; - let res = serialize_into_arrays(&fields, &items).unwrap(); + let res = to_arrow(&fields, &items).unwrap(); assert_eq!(res.len(), 1); assert_eq!(res[0].len(), items.len()); } @@ -212,7 +212,7 @@ test_generic!( .unwrap(), ]; - let res = serialize_into_arrays(&fields, &items).unwrap(); + let res = to_arrow(&fields, &items).unwrap(); assert_eq!(res.len(), 2); assert_eq!(res[0].len(), items.len()); assert_eq!(res[1].len(), items.len()); diff --git a/serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs b/serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs index 5ee2b5d7..061488cb 100644 --- a/serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs +++ b/serde_arrow/src/test_impls/issue_79_declared_but_missing_fields.rs @@ -16,7 +16,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::U8, true)).unwrap(), ]; - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); assert_eq!(arrays.len(), 2); assert_eq!(arrays[0].len(), 2); @@ -40,7 +40,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::U8, false)).unwrap(), ]; - let Err(err) = serialize_into_arrays(&fields, &items) else { + let Err(err) = to_arrow(&fields, &items) else { panic!("Expected error"); }; assert!( diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs b/serde_arrow/src/test_impls/issue_90_type_tracing.rs index 22749d7e..0d53f294 100644 --- a/serde_arrow/src/test_impls/issue_90_type_tracing.rs +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -3,13 +3,14 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; use crate::internal::{ + generic::{Item, Items}, schema::{GenericDataType as T, GenericField as F, Strategy}, tracing::{Tracer, TracingOptions}, }; fn trace_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> F { - 
let mut tracer = Tracer::new(String::from("$"), options.as_field("root")); - tracer.trace_type::().unwrap(); + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_type::>().unwrap(); let schema = tracer.to_schema().unwrap(); schema.fields.into_iter().next().unwrap() @@ -29,7 +30,7 @@ fn issue_90() { } let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Struct, false).with_child( + let expected = F::new("item", T::Struct, false).with_child( F::new("distribution", T::Struct, true) .with_child(F::new("samples", T::LargeList, false).with_child(F::new( "element", @@ -46,49 +47,49 @@ fn issue_90() { fn trace_primitives() { assert_eq!( trace_type::<()>(TracingOptions::default().allow_null_fields(true)), - F::new("root", T::Null, true), + F::new("item", T::Null, true), ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::I8, false) + F::new("item", T::I8, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::I16, false) + F::new("item", T::I16, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::I32, false) + F::new("item", T::I32, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::I64, false) + F::new("item", T::I64, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::U8, false) + F::new("item", T::U8, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::U16, false) + F::new("item", T::U16, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::U32, false) + F::new("item", T::U32, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::U64, false) + F::new("item", T::U64, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::F32, false) + F::new("item", T::F32, false) ); assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::F64, false) + 
F::new("item", T::F64, false) ); } @@ -96,11 +97,11 @@ fn trace_primitives() { fn trace_option() { assert_eq!( trace_type::(TracingOptions::default()), - F::new("root", T::I8, false) + F::new("item", T::I8, false) ); assert_eq!( trace_type::>(TracingOptions::default()), - F::new("root", T::I8, true) + F::new("item", T::I8, true) ); } @@ -114,7 +115,7 @@ fn trace_struct() { } let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Struct, false) + let expected = F::new("item", T::Struct, false) .with_child(F::new("a", T::Bool, false)) .with_child(F::new("b", T::I8, true)); @@ -124,7 +125,7 @@ fn trace_struct() { #[test] fn trace_tuple_as_struct() { let actual = trace_type::<(bool, Option)>(TracingOptions::default()); - let expected = F::new("root", T::Struct, false) + let expected = F::new("item", T::Struct, false) .with_child(F::new("0", T::Bool, false)) .with_child(F::new("1", T::I8, true)) .with_strategy(Strategy::TupleAsStruct); @@ -142,7 +143,7 @@ fn trace_union() { } let actual = trace_type::(TracingOptions::default()); - let expected = F::new("root", T::Union, false) + let expected = F::new("item", T::Union, false) .with_child(F::new("A", T::I8, false)) .with_child(F::new("B", T::F32, false)); @@ -153,7 +154,7 @@ fn trace_union() { fn trace_list() { let actual = trace_type::>(TracingOptions::default()); let expected = - F::new("root", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); + F::new("item", T::LargeList, false).with_child(F::new("element", T::LargeUtf8, false)); assert_eq!(actual, expected); } @@ -161,7 +162,7 @@ fn trace_list() { #[test] fn trace_map() { let actual = trace_type::>(TracingOptions::default().map_as_struct(false)); - let expected = F::new("root", T::Map, false).with_child( + let expected = F::new("item", T::Map, false).with_child( F::new("entries", T::Struct, false) .with_child(F::new("key", T::I8, false)) .with_child(F::new("value", T::LargeUtf8, false)), @@ -225,12 +226,12 @@ 
mod mixed_tracing_dates { fn invalid_values_first() { let mut tracer = Tracer::new( String::from("$"), - TracingOptions::default().guess_dates(true).as_field("root"), + TracingOptions::default().guess_dates(true), ); - tracer.trace_samples(&["foo bar"]).unwrap(); - tracer.trace_type::().unwrap(); - tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); + tracer.trace_samples(&Items(["foo bar"])).unwrap(); + tracer.trace_type::>().unwrap(); + tracer.trace_samples(&Items(["2015-09-18T23:56:04Z"])).unwrap(); let actual = tracer .to_schema() @@ -239,7 +240,7 @@ mod mixed_tracing_dates { .into_iter() .next() .unwrap(); - let expected = F::new("root", T::LargeUtf8, false); + let expected = F::new("item", T::LargeUtf8, false); assert_eq!(actual, expected); } @@ -248,12 +249,12 @@ mod mixed_tracing_dates { fn invalid_values_last() { let mut tracer = Tracer::new( String::from("$"), - TracingOptions::default().guess_dates(true).as_field("root"), + TracingOptions::default().guess_dates(true), ); - tracer.trace_samples(&["2015-09-18T23:56:04Z"]).unwrap(); - tracer.trace_type::().unwrap(); - tracer.trace_samples(&["foo bar"]).unwrap(); + tracer.trace_samples(&Items(["2015-09-18T23:56:04Z"])).unwrap(); + tracer.trace_type::>().unwrap(); + tracer.trace_samples(&Items(["foo bar"])).unwrap(); let actual = tracer .to_schema() @@ -262,14 +263,14 @@ mod mixed_tracing_dates { .into_iter() .next() .unwrap(); - let expected = F::new("root", T::LargeUtf8, false); + let expected = F::new("item", T::LargeUtf8, false); assert_eq!(actual, expected); } } mod mixed_tracing_unions { - use crate::internal::{generic, tracing}; + use crate::internal::{generic::{Items, Item}, tracing}; use super::*; @@ -285,15 +286,14 @@ mod mixed_tracing_unions { let mut tracer = tracing::Tracer::new( String::from("$"), TracingOptions::default() - .allow_null_fields(true) - .as_field("root"), + .allow_null_fields(true), ); - tracer.trace_type::().unwrap(); - tracer.trace_samples(&[E::A, E::C(32)]).unwrap(); + 
tracer.trace_type::>().unwrap(); + tracer.trace_samples(&Items(&[E::A, E::C(32)])).unwrap(); let schema = tracer.to_schema().unwrap(); - let actual = generic::to_single_item(schema.fields).unwrap(); - let expected = F::new("root", T::Union, false) + let actual = schema.fields.into_iter().next().unwrap(); + let expected = F::new("item", T::Union, false) .with_child(F::new("A", T::Null, true)) .with_child(F::new("B", T::Null, true)) .with_child(F::new("C", T::U32, false)); diff --git a/serde_arrow/src/test_impls/json_values.rs b/serde_arrow/src/test_impls/json_values.rs index 8f6e254d..cd3d2a64 100644 --- a/serde_arrow/src/test_impls/json_values.rs +++ b/serde_arrow/src/test_impls/json_values.rs @@ -7,8 +7,8 @@ test_generic!( let tracing_options = TracingOptions::default(); let items = vec![json!({ "a": 1, "b": 2 }), json!({ "a": 3, "b": 4 })]; - let fields = serialize_into_fields(&items, tracing_options).unwrap(); - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -21,8 +21,8 @@ test_generic!( let tracing_options = TracingOptions::default().coerce_numbers(true); let items = vec![json!({ "a": 1, "b": -2 }), json!({ "a": 3.0, "b": 4 })]; - let fields = serialize_into_fields(&items, tracing_options).unwrap(); - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -39,7 +39,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::I64, false)).unwrap(), ]; - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -56,7 +56,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::I64, 
false)).unwrap(), ]; - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -73,7 +73,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::I64, false)).unwrap(), ]; - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -95,7 +95,7 @@ test_generic!( Field::try_from(&GenericField::new("a", GenericDataType::Utf8, false)).unwrap(), ]; - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); } @@ -112,7 +112,7 @@ test_generic!( Field::try_from(&GenericField::new("b", GenericDataType::I64, false)).unwrap(), ]; - let Err(err) = serialize_into_arrays(&fields, &items) else { + let Err(err) = to_arrow(&fields, &items) else { panic!("expected an error, but no error was raised"); }; diff --git a/serde_arrow/src/test_impls/list.rs b/serde_arrow/src/test_impls/list.rs index ea7230f8..4dc6d4d6 100644 --- a/serde_arrow/src/test_impls/list.rs +++ b/serde_arrow/src/test_impls/list.rs @@ -3,7 +3,7 @@ use super::macros::test_example; test_example!( test_name = large_list_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::U32, false)), ty = Vec, values = [vec![0, 1, 2], vec![3, 4], vec![]], @@ -13,7 +13,7 @@ test_example!( test_example!( test_name = large_list_nullable_u64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::U64, true)), ty = Vec>, values = [vec![Some(0), None, Some(2)], vec![Some(3)], vec![None], vec![]], @@ -23,7 +23,7 @@ test_example!( 
test_example!( test_name = nullable_large_list_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, true) + field = GenericField::new("item", GenericDataType::LargeList, true) .with_child(GenericField::new("element", GenericDataType::U32, false)), ty = Option>, values = [Some(vec![0, 1, 2]), None, Some(vec![3, 4]), Some(vec![])], @@ -33,9 +33,9 @@ test_example!( test_example!( test_name = list_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::U32, false)), - overwrite_field = GenericField::new("root", GenericDataType::List, false) + overwrite_field = GenericField::new("item", GenericDataType::List, false) .with_child(GenericField::new("element", GenericDataType::U32, false)), ty = Vec, values = [vec![0, 1, 2], vec![3, 4], vec![]], @@ -45,7 +45,7 @@ test_example!( test_example!( test_name = nested_large_list_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::U32, false))), ty = Vec>, @@ -56,7 +56,7 @@ test_example!( test_example!( test_name = nullable_vec_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, true) + field = GenericField::new("item", GenericDataType::LargeList, true) .with_child(GenericField::new("element", GenericDataType::Bool, false)), ty = Option>, values = [Some(vec![true, false]), None, Some(vec![])], @@ -65,7 +65,7 @@ test_example!( test_example!( test_name = nullable_vec_bool_nested, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, 
true) + field = GenericField::new("item", GenericDataType::LargeList, true) .with_child(GenericField::new("element", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::Bool, false))), ty = Option>>, @@ -75,7 +75,7 @@ test_example!( test_example!( test_name = vec_nullable_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::Bool, true)), ty = Vec>, values = [vec![Some(true), Some(false)], vec![], vec![None, Some(false)]], @@ -84,7 +84,7 @@ test_example!( test_example!( test_name = byte_arrays, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeList, false) + field = GenericField::new("item", GenericDataType::LargeList, false) .with_child(GenericField::new("element", GenericDataType::U8, false)), ty = Vec, values = [ diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index 949e947b..ead1b42e 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -49,18 +49,15 @@ macro_rules! test_example_impl { use super::*; use crate::{ - internal::{ - schema::{ - GenericDataType, - GenericField, - GenericTimeUnit, - Strategy, - }, - tracing::TracingOptions, + schema::{SerdeArrowSchema, TracingOptions, Strategy}, + utils::Items, + internal::schema::{ + GenericDataType, + GenericField, + GenericTimeUnit, }, test_impls::{ macros::{btree_map, hash_map}, - utils::ScopedConfiguration, }, }; @@ -78,8 +75,8 @@ macro_rules! 
test_example_impl { println!("{options:?}"); - let actual = serialize_into_field(&items, "root", options).unwrap(); - let expected: Field = (&field).try_into().unwrap(); + let actual: Vec = SerdeArrowSchema::from_samples(&Items(items), options).unwrap().try_into().unwrap(); + let expected: Vec = vec![(&field).try_into().unwrap()]; assert_eq!( actual, expected, @@ -94,7 +91,7 @@ macro_rules! test_example_impl { expected = expected, ); - let traced: GenericField = (&actual).try_into().unwrap(); + let traced: GenericField = (&actual[0]).try_into().unwrap(); println!("traced: {:?}\n", traced); println!("defined: {:?}\n", field); @@ -104,10 +101,6 @@ macro_rules! test_example_impl { $(#[ignore = $ignore])? #[test] fn serialization() { - let _guard = ScopedConfiguration::configure(|c| { - c.debug_print_program = true; - }); - $($($definitions)*)? let items: &[$ty] = &$values; @@ -115,7 +108,8 @@ macro_rules! test_example_impl { $(let field = $overwrite_field;)? let field: Field = (&field).try_into().unwrap(); - let array = serialize_into_array(&field, &items).unwrap(); + let arrays = to_arrow(std::slice::from_ref(&field), &Items(items)).unwrap(); + let array = arrays.into_iter().next().unwrap(); assert_eq!(array.data_type(), field.data_type(), "Unexpected data type"); assert_eq!(array.len(), items.len(), "Unexpected number of items"); @@ -149,7 +143,10 @@ macro_rules! test_example_impl { let expected_items = items; $(let expected_items: &[$ty] = &$expected_values;)? - let items_round_trip: Vec<$ty> = deserialize_from_array(&field, &array).unwrap(); + let Items(items_round_trip): Items> = from_arrow( + std::slice::from_ref(&field), + std::slice::from_ref(&array), + ).unwrap(); assert_eq!(expected_items, items_round_trip); } } @@ -164,7 +161,8 @@ macro_rules! test_example_impl { $(let field = $overwrite_field;)? 
let field: Field = (&field).try_into().unwrap(); - let array_reference = serialize_into_array(&field, &items).unwrap(); + let arrays_reference = to_arrow(std::slice::from_ref(&field), &Items(items)).unwrap(); + let array_reference = arrays_reference.into_iter().next().unwrap(); let mut builder = ArrayBuilder::new(&field).unwrap(); @@ -189,7 +187,10 @@ macro_rules! test_example_impl { let expected_items = items; $(let expected_items: &[$ty] = &$expected_values;)? - let items_round_trip: Vec<$ty> = deserialize_from_array(&field, &array).unwrap(); + let Items(items_round_trip): Items> = from_arrow( + std::slice::from_ref(&field), + std::slice::from_ref(&array), + ).unwrap(); assert_eq!(expected_items, items_round_trip); } } @@ -226,7 +227,9 @@ macro_rules! test_example { mod $test_name { mod arrow { use crate::{ - arrow::{deserialize_from_array, serialize_into_field, serialize_into_array, ArrayBuilder}, + to_arrow, + from_arrow, + arrow::ArrayBuilder, _impl::arrow::datatypes::Field, }; const IMPL: &'static str = "arrow"; @@ -239,7 +242,9 @@ macro_rules! test_example { } mod arrow2 { use crate::{ - arrow2::{deserialize_from_array, serialize_into_field, serialize_into_array, ArrayBuilder}, + to_arrow2 as to_arrow, + from_arrow2 as from_arrow, + arrow2::ArrayBuilder, _impl::arrow2::datatypes::Field, }; const IMPL: &'static str = "arrow2"; @@ -285,7 +290,7 @@ macro_rules! test_events { let mut tracer = Tracer::new(String::from("$"), options); let mut sink = StripOuterSequenceSink::new(&mut tracer); accept_events(&mut sink, events.iter().cloned()).unwrap(); - let root = tracer.to_field("root").unwrap(); + let root = tracer.to_field("item").unwrap(); assert_eq!(root.children, fields); } @@ -314,77 +319,6 @@ macro_rules! test_events { pub(crate) use test_events; -macro_rules! 
test_error_impl { - ( - test_name = $test_name:ident, - expected_error = $expected_error:expr, - block = $block:expr, - ) => { - use super::*; - - use $crate::internal::error::Result; - - #[test] - fn test() { - fn block() -> Result<()> { - $block - }; - - let actual = block(); - let expected = $expected_error; - - let Err(actual) = actual else { - panic!("expected an error, but no error was raised"); - }; - - let actual = actual.to_string(); - - if !actual.contains(expected) { - panic!("Error did not contain {expected:?}. Full error: {actual}"); - } - } - }; -} - -pub(crate) use test_error_impl; - -macro_rules! test_error { - ( - test_name = $test_name:ident, - $($tt:tt)* - ) => { - #[allow(unused)] - mod $test_name { - mod arrow { - use crate::{ - arrow::{deserialize_from_array, serialize_into_field, serialize_into_array, ArrayBuilder}, - _impl::arrow::datatypes::Field, - }; - const IMPL: &'static str = "arrow"; - - $crate::test_impls::macros::test_error_impl!( - test_name = $test_name, - $($tt)* - ); - } - mod arrow2 { - use crate::{ - arrow2::{deserialize_from_array, serialize_into_field, serialize_into_array, ArrayBuilder}, - _impl::arrow2::datatypes::Field, - }; - const IMPL: &'static str = "arrow2"; - - $crate::test_impls::macros::test_error_impl!( - test_name = $test_name, - $($tt)* - ); - } - } - }; -} - -pub(crate) use test_error; - macro_rules! test_roundtrip_arrays { ( $name:ident { @@ -400,6 +334,7 @@ macro_rules! test_roundtrip_arrays { mod arrow2 { use serde::{Serialize, Deserialize}; use crate::{ + to_arrow2, from_arrow2, arrow2, internal::schema::{GenericField, GenericDataType}, Result, @@ -418,8 +353,8 @@ macro_rules! 
test_roundtrip_arrays { let fields = fields.iter().map(|f| Field::try_from(f)).collect::>>().unwrap(); - let arrays = arrow2::serialize_into_arrays(&fields, inputs).unwrap(); - let reconstructed: Vec = arrow2::deserialize_from_arrays(&fields, &arrays).unwrap(); + let arrays = to_arrow2(&fields, inputs).unwrap(); + let reconstructed: Vec = from_arrow2(&fields, &arrays).unwrap(); assert_eq!(reconstructed, expected); } @@ -443,7 +378,7 @@ macro_rules! test_roundtrip_arrays { } let arrays = builder.build_arrays().unwrap(); - let reconstructed: Vec = arrow2::deserialize_from_arrays(&fields, &arrays).unwrap(); + let reconstructed: Vec = from_arrow2(&fields, &arrays).unwrap(); assert_eq!(reconstructed, expected); } @@ -464,7 +399,7 @@ macro_rules! test_roundtrip_arrays { builder.extend(inputs).unwrap(); let arrays = builder.build_arrays().unwrap(); - let reconstructed: Vec = arrow2::deserialize_from_arrays(&fields, &arrays).unwrap(); + let reconstructed: Vec = from_arrow2(&fields, &arrays).unwrap(); assert_eq!(reconstructed, expected); } @@ -475,70 +410,6 @@ macro_rules! test_roundtrip_arrays { pub(crate) use test_roundtrip_arrays; -macro_rules! test_serialize_into_array { - ( - $(#[ignore = $ignore:literal])? - test_name = $test_name:ident, - $($tt:tt)* - ) => { - #[allow(unused)] - mod $test_name { - mod arrow { - use crate::arrow::{serialize_into_field, serialize_into_array}; - $crate::test_impls::macros::test_serialize_into_array_impl!( - $(#[ignore = $ignore])? - test_name = $test_name, - $($tt)* - ); - } - mod arrow2 { - use crate::arrow2::{serialize_into_field, serialize_into_array}; - $crate::test_impls::macros::test_serialize_into_array_impl!( - $(#[ignore = $ignore])? - test_name = $test_name, - $($tt)* - ); - } - } - }; -} - -pub(crate) use test_serialize_into_array; - -macro_rules! test_serialize_into_array_impl { - ( - $(#[ignore = $ignore:literal])? - test_name = $test_name:ident, - values = $values:expr, - $(define = { $($definitions:item)* } ,)? 
- ) => { - use super::*; - - use crate::{ - internal::tracing::TracingOptions, - test_impls::utils::ScopedConfiguration, - }; - - $(#[ignore = $ignore])? - #[test] - fn serialization() { - let _guard = ScopedConfiguration::configure(|c| { - c.debug_print_program = true; - }); - - $($($definitions)*)? - - let items = &$values; - let field = serialize_into_field(&items, "root", TracingOptions::default()).unwrap(); - let array = serialize_into_array(&field, &items).unwrap(); - - drop(array); - } - - }; -} -pub(crate) use test_serialize_into_array_impl; - macro_rules! test_generic { ( $(#[ignore = $ignore:literal])? @@ -549,16 +420,14 @@ macro_rules! test_generic { #[allow(unused)] mod $name { use crate::{ - internal::{ - schema::{GenericField, GenericDataType}, - tracing::TracingOptions, - }, - test_impls::utils::ScopedConfiguration, + schema::{SerdeArrowSchema, TracingOptions}, + utils::{Items, Item} }; + use crate::internal::schema::{GenericField, GenericDataType}; mod arrow { use super::*; - use crate::arrow::{serialize_into_fields, serialize_into_arrays}; + use crate::{to_arrow, from_arrow}; use crate::_impl::arrow::datatypes::Field; $(#[ignore = $ignore])? @@ -569,7 +438,7 @@ macro_rules! test_generic { } mod arrow2 { use super::*; - use crate::arrow2::{serialize_into_fields, serialize_into_arrays}; + use crate::{to_arrow2 as to_arrow, from_arrow2 as from_arrow}; use crate::_impl::arrow2::datatypes::Field; $(#[ignore = $ignore])? @@ -583,3 +452,14 @@ macro_rules! test_generic { } pub(crate) use test_generic; + +pub fn expect_error(actual: &Result, expected: &str) { + let Err(actual) = actual else { + panic!("expected an error, but no error was raised"); + }; + + let actual = actual.to_string(); + if !actual.contains(expected) { + panic!("Error did not contain {expected:?}. 
Full error: {actual}"); + } +} \ No newline at end of file diff --git a/serde_arrow/src/test_impls/map.rs b/serde_arrow/src/test_impls/map.rs index 8378915d..f2e6e00a 100644 --- a/serde_arrow/src/test_impls/map.rs +++ b/serde_arrow/src/test_impls/map.rs @@ -5,7 +5,7 @@ use super::macros::{test_events, test_example}; test_example!( test_name = map_as_struct, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::U32, false)), @@ -20,7 +20,7 @@ test_example!( test_example!( test_name = hash_map_as_struct, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::U32, false)), @@ -35,7 +35,7 @@ test_example!( test_example!( test_name = map_as_struct_nullable, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::U32, false)), @@ -52,7 +52,7 @@ test_example!( test_name = map_as_struct_missing_fields, test_bytecode_deserialization = true, test_deserialization = [], - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::U32, true)), @@ 
-68,7 +68,7 @@ test_example!( test_name = map_as_struct_missing_fields_2, test_bytecode_deserialization = true, test_deserialization = [], - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, true)) .with_child(GenericField::new("b", GenericDataType::U32, true)), @@ -86,7 +86,7 @@ test_example!( test_name = map_as_struct_missing_fields_3, test_bytecode_deserialization = true, test_deserialization = [], - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, true)) .with_child(GenericField::new("b", GenericDataType::U32, true)), @@ -103,7 +103,7 @@ test_example!( test_example!( test_name = map_as_struct_nullable_fields, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::U32, true)) .with_child(GenericField::new("b", GenericDataType::U32, true)), @@ -119,7 +119,7 @@ test_example!( test_name = map_as_map, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child( GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::LargeUtf8, false)) @@ -137,7 +137,7 @@ test_example!( test_name = map_as_map_empty, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + 
field = GenericField::new("item", GenericDataType::Map, false) .with_child( GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::LargeUtf8, false)) @@ -156,7 +156,7 @@ test_example!( test_name = map_as_map_int_keys, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child( GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::I32, false)) @@ -173,7 +173,7 @@ test_example!( test_example!( test_name = hash_maps, tracing_options = TracingOptions::new().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child(GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Bool, false))), @@ -188,7 +188,7 @@ test_example!( test_example!( test_name = hash_maps_nullable, tracing_options = TracingOptions::new().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, true) + field = GenericField::new("item", GenericDataType::Map, true) .with_child(GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Bool, false))), @@ -203,7 +203,7 @@ test_example!( test_example!( test_name = hash_maps_nullable_keys, tracing_options = TracingOptions::new().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child(GenericField::new("entries", GenericDataType::Struct, false) 
.with_child(GenericField::new("key", GenericDataType::I64, true)) .with_child(GenericField::new("value", GenericDataType::Bool, false))), @@ -218,7 +218,7 @@ test_example!( test_example!( test_name = hash_maps_nullable_values, tracing_options = TracingOptions::new().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child(GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Bool, true))), @@ -233,7 +233,7 @@ test_example!( test_example!( test_name = btree_maps, tracing_options = TracingOptions::new().map_as_struct(false), - field = GenericField::new("root", GenericDataType::Map, false) + field = GenericField::new("item", GenericDataType::Map, false) .with_child(GenericField::new("entries", GenericDataType::Struct, false) .with_child(GenericField::new("key", GenericDataType::I64, false)) .with_child(GenericField::new("value", GenericDataType::Bool, false))), diff --git a/serde_arrow/src/test_impls/mod.rs b/serde_arrow/src/test_impls/mod.rs index c48f9670..18a398f1 100644 --- a/serde_arrow/src/test_impls/mod.rs +++ b/serde_arrow/src/test_impls/mod.rs @@ -9,7 +9,6 @@ mod primitives; mod r#struct; mod tuple; mod r#union; -mod utils; mod wrappers; mod issue_74_unknown_fields; diff --git a/serde_arrow/src/test_impls/primitives.rs b/serde_arrow/src/test_impls/primitives.rs index de663e59..0d8c2e11 100644 --- a/serde_arrow/src/test_impls/primitives.rs +++ b/serde_arrow/src/test_impls/primitives.rs @@ -4,7 +4,7 @@ test_example!( test_name = null, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Null, true), + field = GenericField::new("item", GenericDataType::Null, true), ty = (), values = [(), (), ()], // NOTE: arrow2 has an 
incorrect is_null impl for NullArray @@ -14,7 +14,7 @@ test_example!( test_example!( test_name = bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Bool, false), + field = GenericField::new("item", GenericDataType::Bool, false), ty = bool, values = [true, false], nulls = [false, false], @@ -23,7 +23,7 @@ test_example!( test_example!( test_name = nullable_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Bool, true), + field = GenericField::new("item", GenericDataType::Bool, true), ty = Option, values = [Some(true), None, Some(false)], nulls = [false, true, false], @@ -32,7 +32,7 @@ test_example!( test_example!( test_name = u8, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U8, false), + field = GenericField::new("item", GenericDataType::U8, false), ty = u8, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -41,7 +41,7 @@ test_example!( test_example!( test_name = nullable_u8, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U8, true), + field = GenericField::new("item", GenericDataType::U8, true), ty = Option, values = [Some(1), None, Some(3), Some(4)], nulls = [false, true, false, false], @@ -50,7 +50,7 @@ test_example!( test_example!( test_name = u16, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U16, false), + field = GenericField::new("item", GenericDataType::U16, false), ty = u16, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -59,7 +59,7 @@ test_example!( test_example!( test_name = nullable_u16, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U16, true), + field = GenericField::new("item", GenericDataType::U16, true), ty = Option, values = [Some(1), None, Some(3), Some(4)], nulls = [false, true, false, false], @@ -68,7 +68,7 @@ test_example!( test_example!( test_name 
= u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U32, false), + field = GenericField::new("item", GenericDataType::U32, false), ty = u32, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -77,7 +77,7 @@ test_example!( test_example!( test_name = nullable_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U32, true), + field = GenericField::new("item", GenericDataType::U32, true), ty = Option, values = [Some(1), None, Some(3), Some(4)], nulls = [false, true, false, false], @@ -86,7 +86,7 @@ test_example!( test_example!( test_name = u64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U64, false), + field = GenericField::new("item", GenericDataType::U64, false), ty = u64, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -95,7 +95,7 @@ test_example!( test_example!( test_name = nullable_u64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U64, true), + field = GenericField::new("item", GenericDataType::U64, true), ty = Option, values = [Some(1), None, Some(3), Some(4)], nulls = [false, true, false, false], @@ -104,7 +104,7 @@ test_example!( test_example!( test_name = i8, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I8, false), + field = GenericField::new("item", GenericDataType::I8, false), ty = i8, values = [-1, 2, -3, 4], nulls = [false, false, false, false], @@ -113,7 +113,7 @@ test_example!( test_example!( test_name = nullable_i8, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I8, true), + field = GenericField::new("item", GenericDataType::I8, true), ty = Option, values = [Some(-1), None, Some(3), Some(-4)], nulls = [false, true, false, false], @@ -122,7 +122,7 @@ test_example!( test_example!( test_name = i16, test_bytecode_deserialization = true, - field = 
GenericField::new("root", GenericDataType::I16, false), + field = GenericField::new("item", GenericDataType::I16, false), ty = i16, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -131,7 +131,7 @@ test_example!( test_example!( test_name = nullable_i16, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I16, true), + field = GenericField::new("item", GenericDataType::I16, true), ty = Option, values = [Some(-1), None, Some(3), Some(-4)], nulls = [false, true, false, false], @@ -140,7 +140,7 @@ test_example!( test_example!( test_name = i32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I32, false), + field = GenericField::new("item", GenericDataType::I32, false), ty = i32, values = [-1, 2, -3, 4], nulls = [false, false, false, false], @@ -149,7 +149,7 @@ test_example!( test_example!( test_name = nullable_i32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I32, true), + field = GenericField::new("item", GenericDataType::I32, true), ty = Option, values = [Some(-1), None, Some(3), Some(-4)], nulls = [false, true, false, false], @@ -158,7 +158,7 @@ test_example!( test_example!( test_name = i64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I64, false), + field = GenericField::new("item", GenericDataType::I64, false), ty = i64, values = [-1, 2, -3, 4], nulls = [false, false, false, false], @@ -167,7 +167,7 @@ test_example!( test_example!( test_name = nullable_i64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I64, true), + field = GenericField::new("item", GenericDataType::I64, true), ty = Option, values = [Some(-1), None, Some(3), Some(-4)], nulls = [false, true, false, false], @@ -176,7 +176,7 @@ test_example!( test_example!( test_name = f32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F32, 
false), + field = GenericField::new("item", GenericDataType::F32, false), ty = f32, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -185,8 +185,8 @@ test_example!( test_example!( test_name = f32_from_f64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F64, false), - overwrite_field = GenericField::new("root", GenericDataType::F32, false), + field = GenericField::new("item", GenericDataType::F64, false), + overwrite_field = GenericField::new("item", GenericDataType::F32, false), ty = f64, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -195,7 +195,7 @@ test_example!( test_example!( test_name = nullable_f32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F32, true), + field = GenericField::new("item", GenericDataType::F32, true), ty = Option, values = [Some(-1.0), None, Some(3.0), Some(-4.0)], nulls = [false, true, false, false], @@ -204,7 +204,7 @@ test_example!( test_example!( test_name = f64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F64, false), + field = GenericField::new("item", GenericDataType::F64, false), ty = f64, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -213,7 +213,7 @@ test_example!( test_example!( test_name = nullable_f64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F64, true), + field = GenericField::new("item", GenericDataType::F64, true), ty = Option, values = [Some(-1.0), None, Some(3.0), Some(-4.0)], nulls = [false, true, false, false], @@ -222,8 +222,8 @@ test_example!( test_example!( test_name = f64_from_f32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F32, false), - overwrite_field = GenericField::new("root", GenericDataType::F64, false), + field = GenericField::new("item", GenericDataType::F32, false), + overwrite_field = 
GenericField::new("item", GenericDataType::F64, false), ty = f32, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -232,8 +232,8 @@ test_example!( test_example!( test_name = f16_from_f32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F32, false), - overwrite_field = GenericField::new("root", GenericDataType::F16, false), + field = GenericField::new("item", GenericDataType::F32, false), + overwrite_field = GenericField::new("item", GenericDataType::F16, false), ty = f32, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -242,8 +242,8 @@ test_example!( test_example!( test_name = f16_from_f64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::F64, false), - overwrite_field = GenericField::new("root", GenericDataType::F16, false), + field = GenericField::new("item", GenericDataType::F64, false), + overwrite_field = GenericField::new("item", GenericDataType::F16, false), ty = f64, values = [-1.0, 2.0, -3.0, 4.0], nulls = [false, false, false, false], @@ -252,7 +252,7 @@ test_example!( test_example!( test_name = str, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), ty = String, values = [ String::from("a"), @@ -266,7 +266,7 @@ test_example!( test_example!( test_name = nullable_str, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, true), + field = GenericField::new("item", GenericDataType::LargeUtf8, true), ty = Option, values = [Some(String::from("a")), None, None, Some(String::from("d"))], nulls = [false, true, true, false], @@ -275,8 +275,8 @@ test_example!( test_example!( test_name = str_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, false), - overwrite_field = GenericField::new("root", 
GenericDataType::Utf8, false), + field = GenericField::new("item", GenericDataType::LargeUtf8, false), + overwrite_field = GenericField::new("item", GenericDataType::Utf8, false), ty = String, values = [ String::from("a"), @@ -290,8 +290,8 @@ test_example!( test_example!( test_name = nullable_str_u32, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::LargeUtf8, true), - overwrite_field = GenericField::new("root", GenericDataType::Utf8, true), + field = GenericField::new("item", GenericDataType::LargeUtf8, true), + overwrite_field = GenericField::new("item", GenericDataType::Utf8, true), ty = Option, values = [Some(String::from("a")), None, None, Some(String::from("d"))], nulls = [false, true, true, false], @@ -300,7 +300,7 @@ test_example!( test_example!( test_name = newtype_i64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::I64, false), + field = GenericField::new("item", GenericDataType::I64, false), ty = I64, values = [I64(-1), I64(2), I64(3), I64(-4)], nulls = [false, false, false, false], @@ -313,8 +313,8 @@ test_example!( test_example!( test_name = u8_to_u16, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U8, false), - overwrite_field = GenericField::new("root", GenericDataType::U16, false), + field = GenericField::new("item", GenericDataType::U8, false), + overwrite_field = GenericField::new("item", GenericDataType::U16, false), ty = u8, values = [1, 2, 3, 4], nulls = [false, false, false, false], @@ -323,8 +323,8 @@ test_example!( test_example!( test_name = u32_to_i64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U32, false), - overwrite_field = GenericField::new("root", GenericDataType::I64, false), + field = GenericField::new("item", GenericDataType::U32, false), + overwrite_field = GenericField::new("item", GenericDataType::I64, false), ty = u32, values = [1, 2, 3, 4], nulls = 
[false, false, false, false], @@ -333,7 +333,7 @@ test_example!( test_example!( test_name = chars, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::U32, false), + field = GenericField::new("item", GenericDataType::U32, false), ty = char, values = ['a', 'b', 'c'], nulls = [false, false, false], diff --git a/serde_arrow/src/test_impls/struct.rs b/serde_arrow/src/test_impls/struct.rs index 5d2c6b4b..ba5847d2 100644 --- a/serde_arrow/src/test_impls/struct.rs +++ b/serde_arrow/src/test_impls/struct.rs @@ -3,7 +3,7 @@ use super::macros::*; test_example!( test_name = struct_, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::Bool, false)), ty = S, @@ -21,7 +21,7 @@ test_example!( test_example!( test_name = struct_nested, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::Bool, false)) .with_child( @@ -51,7 +51,7 @@ test_example!( test_example!( test_name = struct_nullable_field, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U32, true)) .with_child(GenericField::new("b", GenericDataType::Bool, false)), ty = S, @@ -78,7 +78,7 @@ test_example!( test_example!( test_name = nullable_struct, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) 
.with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::Bool, false)), ty = Option, @@ -96,7 +96,7 @@ test_example!( test_example!( test_name = nullable_nested_struct, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::Struct, true) .with_child(GenericField::new("c", GenericDataType::I16, false)) @@ -122,7 +122,7 @@ test_example!( test_example!( test_name = nullable_struct_nullable_fields, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_child(GenericField::new("a", GenericDataType::U32, true)) .with_child(GenericField::new("b", GenericDataType::Bool, true)), ty = Option, @@ -146,7 +146,7 @@ test_example!( // arrow2 panics with: OutOfSpec("A StructArray must contain at least one field") // test_example!( // test_name = empt_struct, -// field = GenericField::new("root", GenericDataType::Struct, false), +// field = GenericField::new("item", GenericDataType::Struct, false), // ty = S, // values = [S {}, S {}, S {}], // nulls = [false, false, false], @@ -159,7 +159,7 @@ test_example!( test_example!( test_name = nullable_struct_list_field, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_child(GenericField::new("a", GenericDataType::U32, false)) .with_child(GenericField::new("b", GenericDataType::LargeList, true) .with_child(GenericField::new("element", GenericDataType::Bool, false))), @@ -184,7 +184,7 @@ test_example!( // #[ignore = "error during serialization"] test_name = serde_flatten, test_bytecode_deserialization = 
true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::MapAsStruct) .with_child(GenericField::new("a", GenericDataType::I8, false)) .with_child(GenericField::new("value", GenericDataType::Bool, false)), @@ -211,7 +211,7 @@ test_example!( test_example!( test_name = flattened_structures, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::I64, false)) .with_child(GenericField::new("b", GenericDataType::F32, false)) .with_child(GenericField::new("c", GenericDataType::F64, false)) @@ -251,7 +251,7 @@ test_example!( test_name = struct_nullable, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root",GenericDataType::Struct, true) + field = GenericField::new("item",GenericDataType::Struct, true) .with_child(GenericField::new("a", GenericDataType::Bool, false)) .with_child(GenericField::new("b", GenericDataType::I64, false)) .with_child(GenericField::new("c", GenericDataType::Null, true)) @@ -292,7 +292,7 @@ test_example!( test_name = struct_nullable_nested, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root",GenericDataType::Struct, true) + field = GenericField::new("item",GenericDataType::Struct, true) .with_child(GenericField::new("inner", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::Bool, false)) .with_child(GenericField::new("b", GenericDataType::I64, false)) @@ -335,7 +335,7 @@ test_example!( test_name = struct_nullable_item, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Struct, false) + field = 
GenericField::new("item", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::Bool, true)) .with_child(GenericField::new("b", GenericDataType::I64, true)) .with_child(GenericField::new("c", GenericDataType::Null, true)) diff --git a/serde_arrow/src/test_impls/tuple.rs b/serde_arrow/src/test_impls/tuple.rs index 8f6c2349..4d75393b 100644 --- a/serde_arrow/src/test_impls/tuple.rs +++ b/serde_arrow/src/test_impls/tuple.rs @@ -3,7 +3,7 @@ use super::macros::test_example; test_example!( test_name = tuple_u64_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::U64, false)) .with_child(GenericField::new("1", GenericDataType::Bool, false)), @@ -15,7 +15,7 @@ test_example!( test_example!( test_name = tuple_struct_u64_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::U64, false)) .with_child(GenericField::new("1", GenericDataType::Bool, false)), @@ -31,7 +31,7 @@ test_example!( test_example!( test_name = nullbale_tuple_u64_bool, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::U64, false)) .with_child(GenericField::new("1", GenericDataType::Bool, false)), @@ -43,7 +43,7 @@ test_example!( test_example!( test_name = tuple_nullable_u64, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, 
false) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::U64, true)), ty = (Option,), @@ -54,7 +54,7 @@ test_example!( test_example!( test_name = tuple_nested, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, false) + field = GenericField::new("item", GenericDataType::Struct, false) .with_strategy(Strategy::TupleAsStruct) .with_child( GenericField::new("0", GenericDataType::Struct, false) @@ -69,7 +69,7 @@ test_example!( test_example!( test_name = tuple_nullable, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::Bool, false)) .with_child(GenericField::new("1", GenericDataType::I64, false)), @@ -84,7 +84,7 @@ test_example!( test_example!( test_name = tuple_nullable_nested, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Struct, true) + field = GenericField::new("item", GenericDataType::Struct, true) .with_strategy(Strategy::TupleAsStruct) .with_child(GenericField::new("0", GenericDataType::Struct, false) .with_strategy(Strategy::TupleAsStruct) diff --git a/serde_arrow/src/test_impls/union.rs b/serde_arrow/src/test_impls/union.rs index 558d6609..a1231dec 100644 --- a/serde_arrow/src/test_impls/union.rs +++ b/serde_arrow/src/test_impls/union.rs @@ -1,10 +1,10 @@ -use super::macros::{test_error, test_example}; +use super::macros::{test_example, test_generic}; test_example!( test_name = fieldless_unions, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("A", GenericDataType::Null, true)) 
.with_child(GenericField::new("B", GenericDataType::Null, true)) .with_child(GenericField::new("C", GenericDataType::Null, true)), @@ -25,7 +25,7 @@ test_example!( test_name = fieldless_union_out_of_order, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("A", GenericDataType::Null, true)) .with_child(GenericField::new("B", GenericDataType::Null, true)) .with_child(GenericField::new("C", GenericDataType::Null, true)), @@ -45,7 +45,7 @@ test_example!( test_example!( test_name = union_simple, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("U32", GenericDataType::U32, false)) .with_child(GenericField::new("Bool", GenericDataType::Bool, false)) .with_child(GenericField::new("Str", GenericDataType::LargeUtf8, false)), @@ -70,7 +70,7 @@ test_example!( test_name = union_mixed, test_bytecode_deserialization = true, field = - GenericField::new("root", GenericDataType::Union, false) + GenericField::new("item", GenericDataType::Union, false) .with_child( GenericField::new("V1", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U32, false)) @@ -108,7 +108,7 @@ test_example!( test_example!( test_name = union_nested, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("U32", GenericDataType::U32, false)) .with_child( GenericField::new("O", GenericDataType::Union, false) @@ -141,7 +141,7 @@ test_example!( test_example!( test_name = enums, test_bytecode_deserialization = true, - field = GenericField::new("root", 
GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("U8", GenericDataType::U8, false)) .with_child(GenericField::new("U16", GenericDataType::U16, false)) .with_child(GenericField::new("U32", GenericDataType::U32, false)) @@ -162,7 +162,7 @@ test_example!( test_example!( test_name = enums_tuple, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child( GenericField::new("A", GenericDataType::Struct, false) .with_strategy(Strategy::TupleAsStruct) @@ -189,7 +189,7 @@ test_example!( test_example!( test_name = enums_struct, test_bytecode_deserialization = true, - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child( GenericField::new("A", GenericDataType::Struct, false) .with_child(GenericField::new("a", GenericDataType::U8, false)) @@ -215,7 +215,7 @@ test_example!( test_name = enums_union, test_bytecode_deserialization = true, tracing_options = TracingOptions::default().allow_null_fields(true), - field = GenericField::new("root", GenericDataType::Union, false) + field = GenericField::new("item", GenericDataType::Union, false) .with_child(GenericField::new("A", GenericDataType::Null, true)) .with_child(GenericField::new("B", GenericDataType::Null, true)), ty = Item, @@ -229,10 +229,8 @@ test_example!( }, ); -test_error!( - test_name = missing_union_variants, - expected_error = "Serialization failed: an unknown variant", - block = { +test_generic!( + fn missing_union_variants() { use crate::schema::TracingOptions; use serde::{Deserialize, Serialize}; @@ -244,40 +242,10 @@ test_error!( } let tracing_options = TracingOptions::default().allow_null_fields(true); - let field = serialize_into_field(&[U::A, U::C], "root", tracing_options).unwrap(); + let fields: Vec = 
SerdeArrowSchema::from_samples(&Items(&[U::A, U::C]), tracing_options).unwrap().try_into().unwrap(); // NOTE: variant B was never encountered during tracing - serialize_into_array(&field, &[U::A, U::B, U::C])?; - - Ok(()) - }, -); - -test_error!( - test_name = missing_union_variant_compilation, - expected_error = "Serialization failed: an unknown variant", - block = { - use crate::schema::TracingOptions; - use crate::test_impls::utils::ScopedConfiguration; - use serde::{Deserialize, Serialize}; - - #[derive(Serialize, Deserialize, Debug, PartialEq)] - enum U { - A, - B, - C, - } - - let _guard = ScopedConfiguration::configure(|c| { - c.debug_print_program = true; - }); - - let tracing_options = TracingOptions::default().allow_null_fields(true); - let field = serialize_into_field(&[U::A, U::C], "root", tracing_options).unwrap(); - - // NOTE: variant B was never encountered during tracing - serialize_into_array(&field, &[U::A, U::B, U::C])?; - - Ok(()) - }, + let res = to_arrow(&fields, &Items(&[U::A, U::B, U::C])); + crate::test_impls::macros::expect_error(&res, "Serialization failed: an unknown variant"); + } ); diff --git a/serde_arrow/src/test_impls/utils.rs b/serde_arrow/src/test_impls/utils.rs deleted file mode 100644 index 89451b25..00000000 --- a/serde_arrow/src/test_impls/utils.rs +++ /dev/null @@ -1,29 +0,0 @@ -//! Helpers to convert between arrow and arrow2 arrays -//! 
-use crate::experimental::Configuration; - -pub struct ScopedConfiguration { - prev_config: Configuration, -} - -impl ScopedConfiguration { - pub fn configure(effect: F) -> Self { - let mut prev_config = Configuration::default(); - { - let prev_config = &mut prev_config; - crate::experimental::configure(move |c| { - *prev_config = c.clone(); - effect(c); - }); - } - Self { prev_config } - } -} - -impl std::ops::Drop for ScopedConfiguration { - fn drop(&mut self) { - crate::experimental::configure(|c| { - *c = self.prev_config.clone(); - }) - } -} diff --git a/serde_arrow/src/test_impls/wrappers.rs b/serde_arrow/src/test_impls/wrappers.rs index ee9f1cd4..89a797d3 100644 --- a/serde_arrow/src/test_impls/wrappers.rs +++ b/serde_arrow/src/test_impls/wrappers.rs @@ -1,9 +1,60 @@ -use super::macros::test_serialize_into_array; +use super::macros::test_generic; -test_serialize_into_array!(test_name = outer_vec, values = vec![0_u32, 1_u32, 2_u32],); +/* + #[test] + fn serialization() { + $($($definitions)*)? 
-test_serialize_into_array!(test_name = outer_slice, values = &[0_u32, 1_u32, 2_u32],); + let items = &$values; + let field = serialize_into_field(&items, "item", TracingOptions::default()).unwrap(); + let array = serialize_into_array(&field, &items).unwrap(); -test_serialize_into_array!(test_name = outer_array, values = [0_u32, 1_u32, 2_u32],); + drop(array); + } +*/ -test_serialize_into_array!(test_name = outer_tuple, values = (0_u32, 1_u32, 2_u32),); +test_generic!( + fn outer_vec() { + let items: Vec = vec![0_u32, 1_u32, 2_u32]; + let fields: Vec = SerdeArrowSchema::from_samples(&Items(&items), TracingOptions::default()).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &Items(&items)).unwrap(); + + drop(arrays); + } +); + +test_generic!( + fn outer_slice() { + let items: &[u32] = &[0_u32, 1_u32, 2_u32]; + let fields: Vec = SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &Items(items)).unwrap(); + + drop(arrays); + } +); + +test_generic!( + fn outer_array() { + let items: &[u32; 3] = &[0_u32, 1_u32, 2_u32]; + let fields: Vec = SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &Items(items)).unwrap(); + + drop(arrays); + } +); + +test_generic!( + fn outer_tupple() { + // Note: the standard Items wrapper does not work with tuples, use a custom impl here + #[derive(serde::Serialize)] + struct Item { + item: u32, + } + + let items: &(Item, Item, Item) = &(Item{ item: 0_u32 }, Item{ item: 1_u32 }, Item{ item: 2_u32 }); + let fields: Vec = SerdeArrowSchema::from_samples(items, TracingOptions::default()).unwrap().try_into().unwrap(); + let arrays = to_arrow(&fields, &items).unwrap(); + + drop(arrays); + } +); diff --git a/serde_arrow/src/utils.rs b/serde_arrow/src/utils.rs new file mode 100644 index 00000000..0ebcaf3d --- /dev/null +++ b/serde_arrow/src/utils.rs @@ -0,0 +1,2 @@ +//! 
Helpers that may be useful when using `serde_arrow` +pub use crate::internal::generic::{Item, Items}; \ No newline at end of file From 44851b034c1f5feaa1afe900a224de62d04b8425 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 10:13:01 +0100 Subject: [PATCH 14/27] Remove deprecated API from benchmarks --- serde_arrow/benches/groups/impls.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/serde_arrow/benches/groups/impls.rs b/serde_arrow/benches/groups/impls.rs index 7690bca7..b75aa5da 100644 --- a/serde_arrow/benches/groups/impls.rs +++ b/serde_arrow/benches/groups/impls.rs @@ -12,6 +12,8 @@ macro_rules! define_benchmark { )? ) => { pub fn benchmark_serialize(c: &mut criterion::Criterion) { + use serde_arrow::schema::SerdeArrowSchema; + for n in [$($n),*] { let mut group = c.benchmark_group(format!("{}_serialize({})", stringify!($name), n)); group.sample_size(20); @@ -22,7 +24,7 @@ macro_rules! define_benchmark { let items = (0..n) .map(|_| <$ty>::random(&mut rng)) .collect::>(); - let arrow_fields = serde_arrow::arrow::serialize_into_fields(&items, Default::default()).unwrap(); + let arrow_fields = SerdeArrowSchema::from_samples(&items, Default::default()).unwrap().to_arrow_fields().unwrap(); #[allow(unused)] let bench_serde_arrow = true; @@ -84,7 +86,7 @@ pub mod serde_arrow { where T: Serialize + ?Sized, { - serde_arrow::arrow::serialize_into_arrays(&fields, &items) + serde_arrow::to_arrow(&fields, &items) } } From fb537b85b4eec812e0feddf65ff36f5972c6a519 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 10:35:58 +0100 Subject: [PATCH 15/27] Rework builders --- serde_arrow/src/arrow/api.rs | 26 ++++++--------- serde_arrow/src/arrow/mod.rs | 6 +++- serde_arrow/src/arrow2/api.rs | 47 +++++++--------------------- serde_arrow/src/arrow2/mod.rs | 6 +++- serde_arrow/src/lib.rs | 26 ++++++++------- serde_arrow/src/test_impls/macros.rs | 37 +++++++++------------- 6 files changed, 60 insertions(+), 88 
deletions(-) diff --git a/serde_arrow/src/arrow/api.rs b/serde_arrow/src/arrow/api.rs index 5690e478..382f1833 100644 --- a/serde_arrow/src/arrow/api.rs +++ b/serde_arrow/src/arrow/api.rs @@ -225,13 +225,12 @@ where /// let array = builder.build_array().unwrap(); /// assert_eq!(array.len(), 6); /// ``` +#[deprecated = "serde_arrow::arrow::ArrayBuilder is deprecated. Use serde_arrow::ArrowBuilder instead"] pub struct ArrayBuilder(generic::GenericBuilder); +#[allow(deprecated)] impl ArrayBuilder { /// Construct a new build for the given field - /// - /// This method may fail for an unsupported data type of the given field. - /// pub fn new(field: &Field) -> Result { Ok(Self(generic::GenericBuilder::new_for_array( GenericField::try_from(field)?, @@ -239,27 +238,22 @@ impl ArrayBuilder { } /// Add a single item to the arrays - /// pub fn push(&mut self, item: &T) -> Result<()> { self.0.push(item) } /// Add multiple items to the arrays - /// pub fn extend(&mut self, items: &T) -> Result<()> { self.0.extend(items) } /// Build the array from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. 
- /// pub fn build_array(&mut self) -> Result { self.0 .0.build_arrow_array() } } -/// Build arrays record by record +/// Build arrow arrays record by record /// /// Example: /// @@ -267,7 +261,7 @@ impl ArrayBuilder { /// # use serde_arrow::_impl::arrow as arrow; /// use arrow::datatypes::{DataType, Field}; /// use serde::Serialize; -/// use serde_arrow::arrow::{ArraysBuilder}; +/// use serde_arrow::ArrowBuilder; /// /// ##[derive(Serialize)] /// struct Record { @@ -279,7 +273,7 @@ impl ArrayBuilder { /// Field::new("a", DataType::Float32, true), /// Field::new("b", DataType::UInt64, false), /// ]; -/// let mut builder = ArraysBuilder::new(&fields).unwrap(); +/// let mut builder = ArrowBuilder::new(&fields).unwrap(); /// /// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); /// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); @@ -296,16 +290,16 @@ impl ArrayBuilder { /// assert_eq!(arrays.len(), 2); /// assert_eq!(arrays[0].len(), 6); /// ``` -pub struct ArraysBuilder(generic::GenericBuilder); +pub struct ArrowBuilder(generic::GenericBuilder); -impl std::fmt::Debug for ArraysBuilder { +impl std::fmt::Debug for ArrowBuilder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "ArraysBuilder<...>") + write!(f, "ArrowBuilder<...>") } } -impl ArraysBuilder { - /// Build a new ArraysBuilder for the given fields +impl ArrowBuilder { + /// Build a new ArrowBuilder for the given fields /// /// This method may fail when unsupported data types are encountered in the /// given fields. 
diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs index a6990bd2..c253ba81 100644 --- a/serde_arrow/src/arrow/mod.rs +++ b/serde_arrow/src/arrow/mod.rs @@ -13,5 +13,9 @@ mod type_support; #[allow(deprecated)] pub use api::{ deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, - serialize_into_field, serialize_into_fields, ArrayBuilder, ArraysBuilder, + serialize_into_field, serialize_into_fields, ArrayBuilder, }; + +/// Build arrays record by record +#[deprecated = "serde_arrow::arrow::ArraysBuilder is deprecated. Use serde_arrow::ArrowBuilder instead."] +pub type ArraysBuilder = api::ArrowBuilder; diff --git a/serde_arrow/src/arrow2/api.rs b/serde_arrow/src/arrow2/api.rs index ad469e43..0782d9f8 100644 --- a/serde_arrow/src/arrow2/api.rs +++ b/serde_arrow/src/arrow2/api.rs @@ -69,33 +69,6 @@ where } /// Build arrays from the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). -/// -/// To build arrays record by record use [ArraysBuilder]. -/// -/// ```rust -/// use serde::Serialize; -/// use serde_arrow::arrow2::{serialize_into_fields, serialize_into_arrays}; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... -/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// ``` -/// pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result>> where T: Serialize + ?Sized, @@ -118,7 +91,7 @@ where /// `items` should be given in the form a list of records (e.g., a vector of /// structs). /// -/// To build arrays record by record use [ArraysBuilder]. +/// To build arrays record by record use [Arrow2Builder]. 
/// /// ```rust /// # fn main() -> serde_arrow::Result<()> { @@ -407,8 +380,10 @@ where /// let array = builder.build_array().unwrap(); /// assert_eq!(array.len(), 6); /// ``` +#[deprecated = "serde_arrow::arrow2::ArrayBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] pub struct ArrayBuilder(generic::GenericBuilder); +#[allow(deprecated)] impl ArrayBuilder { /// Construct a new build for the given field /// @@ -441,7 +416,7 @@ impl ArrayBuilder { } } -/// Build arrays record by record +/// Build arrow2 arrays record by record /// /// Example: /// @@ -449,7 +424,7 @@ impl ArrayBuilder { /// # use serde_arrow::_impl::arrow2 as arrow2; /// use arrow2::datatypes::{DataType, Field}; /// use serde::Serialize; -/// use serde_arrow::arrow2::{ArraysBuilder}; +/// use serde_arrow::Arrow2Builder; /// /// ##[derive(Serialize)] /// struct Record { @@ -461,7 +436,7 @@ impl ArrayBuilder { /// Field::new("a", DataType::Float32, true), /// Field::new("b", DataType::UInt64, false), /// ]; -/// let mut builder = ArraysBuilder::new(&fields).unwrap(); +/// let mut builder = Arrow2Builder::new(&fields).unwrap(); /// /// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); /// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); @@ -478,16 +453,16 @@ impl ArrayBuilder { /// assert_eq!(arrays.len(), 2); /// assert_eq!(arrays[0].len(), 6); /// ``` -pub struct ArraysBuilder(generic::GenericBuilder); +pub struct Arrow2Builder(generic::GenericBuilder); -impl std::fmt::Debug for ArraysBuilder { +impl std::fmt::Debug for Arrow2Builder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "ArraysBuilder<...>") + write!(f, "Arrow2Builder<...>") } } -impl ArraysBuilder { - /// Build a new ArraysBuilder for the given fields +impl Arrow2Builder { + /// Build a new Arrow2Builder for the given fields /// /// This method may fail when unsupported data types are encountered in the /// given fields. 
diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs index 84411f00..0422e5c3 100644 --- a/serde_arrow/src/arrow2/mod.rs +++ b/serde_arrow/src/arrow2/mod.rs @@ -15,5 +15,9 @@ mod test; #[allow(deprecated)] pub use api::{ deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, - serialize_into_field, serialize_into_fields, ArrayBuilder, ArraysBuilder, + serialize_into_field, serialize_into_fields, ArrayBuilder, Arrow2Builder, }; + +/// Build arrays record by record +#[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] +pub type ArraysBuilder = Arrow2Builder; diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index e8847fd7..3f747335 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -124,12 +124,6 @@ //! mod internal; -#[cfg(has_arrow)] -pub use arrow::api::{from_arrow, to_arrow}; - -#[cfg(has_arrow2)] -pub use arrow2::api::{from_arrow2, to_arrow2}; - /// Internal. Do not use /// /// This module is an internal implementation detail and not subject to any @@ -217,12 +211,6 @@ pub mod _impl { } } -#[cfg(has_arrow2)] -pub mod arrow2; - -#[cfg(has_arrow)] -pub mod arrow; - #[cfg(all(test, has_arrow, has_arrow2))] mod test_impls; @@ -234,6 +222,20 @@ mod test; pub use crate::internal::error::{Error, Result}; +#[cfg(has_arrow)] +pub use arrow::api::{from_arrow, to_arrow, ArrowBuilder}; + +#[cfg(has_arrow)] +// #[deprecated = "The items in serde_arrow::arrow are deprecated. See the individual items for suitable replacements"] +pub mod arrow; + +#[cfg(has_arrow2)] +pub use arrow2::api::{from_arrow2, to_arrow2, Arrow2Builder}; + +#[cfg(has_arrow2)] +// #[deprecated = "The items in serde_arrow::arrow2 are deprecated. 
See the individual items for suitable replacements"] +pub mod arrow2; + #[deny(missing_docs)] pub mod schema; diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index ead1b42e..b46a5ef0 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -50,16 +50,10 @@ macro_rules! test_example_impl { use crate::{ schema::{SerdeArrowSchema, TracingOptions, Strategy}, - utils::Items, - internal::schema::{ - GenericDataType, - GenericField, - GenericTimeUnit, - }, - test_impls::{ - macros::{btree_map, hash_map}, - }, + utils::{Items, Item}, }; + use crate::internal::schema::{GenericDataType, GenericField, GenericTimeUnit}; + use crate::test_impls::macros::{btree_map, hash_map}; $(#[ignore = $ignore])? #[test] @@ -164,20 +158,22 @@ macro_rules! test_example_impl { let arrays_reference = to_arrow(std::slice::from_ref(&field), &Items(items)).unwrap(); let array_reference = arrays_reference.into_iter().next().unwrap(); - let mut builder = ArrayBuilder::new(&field).unwrap(); + let mut builder = ArrowBuilder::new(std::slice::from_ref(&field)).unwrap(); // build using extend - builder.extend(items).unwrap(); + builder.extend(&Items(items)).unwrap(); - let array = builder.build_array().unwrap(); + let arrays = builder.build_arrays().unwrap(); + let array = arrays.into_iter().next().unwrap(); assert_eq!(array.as_ref(), array_reference.as_ref()); // re-use the builder for item in items { - builder.push(item).unwrap(); + builder.push(&Item(item)).unwrap(); } - let array = builder.build_array().unwrap(); + let arrays = builder.build_arrays().unwrap(); + let array = arrays.into_iter().next().unwrap(); assert_eq!(array.as_ref(), array_reference.as_ref()); let test_deserialization: &[&str] = &["arrow", "arrow2"]; @@ -227,9 +223,7 @@ macro_rules! 
test_example { mod $test_name { mod arrow { use crate::{ - to_arrow, - from_arrow, - arrow::ArrayBuilder, + ArrowBuilder, to_arrow, from_arrow, _impl::arrow::datatypes::Field, }; const IMPL: &'static str = "arrow"; @@ -242,9 +236,9 @@ macro_rules! test_example { } mod arrow2 { use crate::{ + Arrow2Builder as ArrowBuilder, to_arrow2 as to_arrow, from_arrow2 as from_arrow, - arrow2::ArrayBuilder, _impl::arrow2::datatypes::Field, }; const IMPL: &'static str = "arrow2"; @@ -334,8 +328,7 @@ macro_rules! test_roundtrip_arrays { mod arrow2 { use serde::{Serialize, Deserialize}; use crate::{ - to_arrow2, from_arrow2, - arrow2, + Arrow2Builder, to_arrow2, from_arrow2, internal::schema::{GenericField, GenericDataType}, Result, }; @@ -371,7 +364,7 @@ macro_rules! test_roundtrip_arrays { let fields = fields.iter().map(|f| Field::try_from(f)).collect::>>().unwrap(); - let mut builder = arrow2::ArraysBuilder::new(&fields).unwrap(); + let mut builder = Arrow2Builder::new(&fields).unwrap(); for item in inputs.iter() { builder.push(item).unwrap(); @@ -395,7 +388,7 @@ macro_rules! 
test_roundtrip_arrays { let fields = fields.iter().map(|f| Field::try_from(f)).collect::>>().unwrap(); - let mut builder = arrow2::ArraysBuilder::new(&fields).unwrap(); + let mut builder = Arrow2Builder::new(&fields).unwrap(); builder.extend(inputs).unwrap(); let arrays = builder.build_arrays().unwrap(); From 67385323bb25795a939e8425efddcc929dcf5fa6 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 10:36:11 +0100 Subject: [PATCH 16/27] Reformat code --- serde_arrow/src/internal/generic.rs | 45 ++++++++++++++----- serde_arrow/src/lib.rs | 2 +- .../src/test_impls/issue_90_type_tracing.rs | 16 ++++--- serde_arrow/src/test_impls/json_values.rs | 10 ++++- serde_arrow/src/test_impls/macros.rs | 8 ++-- serde_arrow/src/test_impls/union.rs | 6 ++- serde_arrow/src/test_impls/wrappers.rs | 29 +++++++++--- serde_arrow/src/utils.rs | 2 +- 8 files changed, 89 insertions(+), 29 deletions(-) diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index e1e5e721..334e29a9 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -1,4 +1,4 @@ -use serde::{Deserialize, Serialize, ser::SerializeSeq}; +use serde::{ser::SerializeSeq, Deserialize, Serialize}; use crate::internal::{ common::{BufferExtract, Buffers}, @@ -81,7 +81,10 @@ pub struct Item( ); impl Serialize for Item { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { #[derive(Debug, Serialize)] struct Item<'a, T> { item: &'a T, @@ -91,7 +94,9 @@ impl Serialize for Item { } impl<'de, T: Deserialize<'de>> Deserialize<'de> for Item { - fn deserialize>(deserializer: D) -> std::result::Result { + fn deserialize>( + deserializer: D, + ) -> std::result::Result { #[derive(Debug, Deserialize)] struct Item { item: T, @@ -103,38 +108,58 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Item { // TODO: implement for all types? 
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Items> { - fn deserialize>(deserializer: D) -> std::result::Result { - let items = Vec::>::deserialize(deserializer)?.into_iter().map(|item| item.0).collect(); + fn deserialize>( + deserializer: D, + ) -> std::result::Result { + let items = Vec::>::deserialize(deserializer)? + .into_iter() + .map(|item| item.0) + .collect(); Ok(Items(items)) } } impl Serialize for Items> { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { Items(self.0.as_slice()).serialize(serializer) } } impl<'a, T: Serialize> Serialize for Items<&'a Vec> { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { Items(self.0.as_slice()).serialize(serializer) } } impl Serialize for Items<[T; N]> { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { Items(self.0.as_slice()).serialize(serializer) } } impl<'a, const N: usize, T: Serialize> Serialize for Items<&'a [T; N]> { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { Items(self.0.as_slice()).serialize(serializer) } } impl<'a, T: Serialize> Serialize for Items<&'a [T]> { - fn serialize(&self, serializer: S) -> std::result::Result { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { let mut seq = serializer.serialize_seq(Some(self.0.len()))?; for item in self.0 { seq.serialize_element(&Item(item))?; diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 3f747335..65be1a4c 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -240,4 +240,4 @@ pub mod arrow2; pub mod schema; #[deny(missing_docs)] -pub mod utils; \ No newline at end of file +pub mod utils; diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs 
b/serde_arrow/src/test_impls/issue_90_type_tracing.rs index 0d53f294..48bd22b2 100644 --- a/serde_arrow/src/test_impls/issue_90_type_tracing.rs +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -231,7 +231,9 @@ mod mixed_tracing_dates { tracer.trace_samples(&Items(["foo bar"])).unwrap(); tracer.trace_type::>().unwrap(); - tracer.trace_samples(&Items(["2015-09-18T23:56:04Z"])).unwrap(); + tracer + .trace_samples(&Items(["2015-09-18T23:56:04Z"])) + .unwrap(); let actual = tracer .to_schema() @@ -252,7 +254,9 @@ mod mixed_tracing_dates { TracingOptions::default().guess_dates(true), ); - tracer.trace_samples(&Items(["2015-09-18T23:56:04Z"])).unwrap(); + tracer + .trace_samples(&Items(["2015-09-18T23:56:04Z"])) + .unwrap(); tracer.trace_type::>().unwrap(); tracer.trace_samples(&Items(["foo bar"])).unwrap(); @@ -270,7 +274,10 @@ mod mixed_tracing_dates { } mod mixed_tracing_unions { - use crate::internal::{generic::{Items, Item}, tracing}; + use crate::internal::{ + generic::{Item, Items}, + tracing, + }; use super::*; @@ -285,8 +292,7 @@ mod mixed_tracing_unions { let mut tracer = tracing::Tracer::new( String::from("$"), - TracingOptions::default() - .allow_null_fields(true), + TracingOptions::default().allow_null_fields(true), ); tracer.trace_type::>().unwrap(); tracer.trace_samples(&Items(&[E::A, E::C(32)])).unwrap(); diff --git a/serde_arrow/src/test_impls/json_values.rs b/serde_arrow/src/test_impls/json_values.rs index cd3d2a64..bbad1b2f 100644 --- a/serde_arrow/src/test_impls/json_values.rs +++ b/serde_arrow/src/test_impls/json_values.rs @@ -7,7 +7,10 @@ test_generic!( let tracing_options = TracingOptions::default(); let items = vec![json!({ "a": 1, "b": 2 }), json!({ "a": 3, "b": 4 })]; - let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options).unwrap().try_into().unwrap(); + let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &items).unwrap(); 
drop(arrays); @@ -21,7 +24,10 @@ test_generic!( let tracing_options = TracingOptions::default().coerce_numbers(true); let items = vec![json!({ "a": 1, "b": -2 }), json!({ "a": 3.0, "b": 4 })]; - let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options).unwrap().try_into().unwrap(); + let fields: Vec = SerdeArrowSchema::from_samples(&items, tracing_options) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); diff --git a/serde_arrow/src/test_impls/macros.rs b/serde_arrow/src/test_impls/macros.rs index b46a5ef0..5621d751 100644 --- a/serde_arrow/src/test_impls/macros.rs +++ b/serde_arrow/src/test_impls/macros.rs @@ -138,7 +138,7 @@ macro_rules! test_example_impl { $(let expected_items: &[$ty] = &$expected_values;)? let Items(items_round_trip): Items> = from_arrow( - std::slice::from_ref(&field), + std::slice::from_ref(&field), std::slice::from_ref(&array), ).unwrap(); assert_eq!(expected_items, items_round_trip); @@ -184,7 +184,7 @@ macro_rules! test_example_impl { $(let expected_items: &[$ty] = &$expected_values;)? let Items(items_round_trip): Items> = from_arrow( - std::slice::from_ref(&field), + std::slice::from_ref(&field), std::slice::from_ref(&array), ).unwrap(); assert_eq!(expected_items, items_round_trip); @@ -328,7 +328,7 @@ macro_rules! test_roundtrip_arrays { mod arrow2 { use serde::{Serialize, Deserialize}; use crate::{ - Arrow2Builder, to_arrow2, from_arrow2, + Arrow2Builder, to_arrow2, from_arrow2, internal::schema::{GenericField, GenericDataType}, Result, }; @@ -455,4 +455,4 @@ pub fn expect_error(actual: &Result, expected: &s if !actual.contains(expected) { panic!("Error did not contain {expected:?}. 
Full error: {actual}"); } -} \ No newline at end of file +} diff --git a/serde_arrow/src/test_impls/union.rs b/serde_arrow/src/test_impls/union.rs index a1231dec..7dd51850 100644 --- a/serde_arrow/src/test_impls/union.rs +++ b/serde_arrow/src/test_impls/union.rs @@ -242,7 +242,11 @@ test_generic!( } let tracing_options = TracingOptions::default().allow_null_fields(true); - let fields: Vec = SerdeArrowSchema::from_samples(&Items(&[U::A, U::C]), tracing_options).unwrap().try_into().unwrap(); + let fields: Vec = + SerdeArrowSchema::from_samples(&Items(&[U::A, U::C]), tracing_options) + .unwrap() + .try_into() + .unwrap(); // NOTE: variant B was never encountered during tracing let res = to_arrow(&fields, &Items(&[U::A, U::B, U::C])); diff --git a/serde_arrow/src/test_impls/wrappers.rs b/serde_arrow/src/test_impls/wrappers.rs index 89a797d3..d06ed2a1 100644 --- a/serde_arrow/src/test_impls/wrappers.rs +++ b/serde_arrow/src/test_impls/wrappers.rs @@ -16,7 +16,11 @@ use super::macros::test_generic; test_generic!( fn outer_vec() { let items: Vec = vec![0_u32, 1_u32, 2_u32]; - let fields: Vec = SerdeArrowSchema::from_samples(&Items(&items), TracingOptions::default()).unwrap().try_into().unwrap(); + let fields: Vec = + SerdeArrowSchema::from_samples(&Items(&items), TracingOptions::default()) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &Items(&items)).unwrap(); drop(arrays); @@ -26,7 +30,11 @@ test_generic!( test_generic!( fn outer_slice() { let items: &[u32] = &[0_u32, 1_u32, 2_u32]; - let fields: Vec = SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()).unwrap().try_into().unwrap(); + let fields: Vec = + SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &Items(items)).unwrap(); drop(arrays); @@ -36,7 +44,11 @@ test_generic!( test_generic!( fn outer_array() { let items: &[u32; 3] = &[0_u32, 1_u32, 2_u32]; - let fields: Vec = 
SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()).unwrap().try_into().unwrap(); + let fields: Vec = + SerdeArrowSchema::from_samples(&Items(items), TracingOptions::default()) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &Items(items)).unwrap(); drop(arrays); @@ -51,8 +63,15 @@ test_generic!( item: u32, } - let items: &(Item, Item, Item) = &(Item{ item: 0_u32 }, Item{ item: 1_u32 }, Item{ item: 2_u32 }); - let fields: Vec = SerdeArrowSchema::from_samples(items, TracingOptions::default()).unwrap().try_into().unwrap(); + let items: &(Item, Item, Item) = &( + Item { item: 0_u32 }, + Item { item: 1_u32 }, + Item { item: 2_u32 }, + ); + let fields: Vec = SerdeArrowSchema::from_samples(items, TracingOptions::default()) + .unwrap() + .try_into() + .unwrap(); let arrays = to_arrow(&fields, &items).unwrap(); drop(arrays); diff --git a/serde_arrow/src/utils.rs b/serde_arrow/src/utils.rs index 0ebcaf3d..b8473436 100644 --- a/serde_arrow/src/utils.rs +++ b/serde_arrow/src/utils.rs @@ -1,2 +1,2 @@ //! 
Helpers that may be useful when using `serde_arrow` -pub use crate::internal::generic::{Item, Items}; \ No newline at end of file +pub use crate::internal::generic::{Item, Items}; From 17fd87ed7b5c9cdade067485c2720e00d3eeee16 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 11:06:44 +0100 Subject: [PATCH 17/27] Deprecated serde_arrow::{arrow, arrow2} modules, add tests for deprectead api --- serde_arrow/src/arrow/mod.rs | 21 --- serde_arrow/src/arrow2/mod.rs | 23 --- .../src/arrow2/test/implementation_docs.rs | 108 ----------- serde_arrow/src/arrow2/test/mod.rs | 1 - .../src/{arrow2 => arrow2_impl}/api.rs | 0 .../deserialization.rs | 0 serde_arrow/src/arrow2_impl/mod.rs | 13 ++ .../src/{arrow2 => arrow2_impl}/schema.rs | 0 .../{arrow2 => arrow2_impl}/serialization.rs | 0 .../src/arrow2_impl/test_deprecated_api.rs | 170 ++++++++++++++++++ .../{arrow2 => arrow2_impl}/type_support.rs | 0 serde_arrow/src/{arrow => arrow_impl}/api.rs | 0 .../{arrow => arrow_impl}/deserialization.rs | 0 serde_arrow/src/arrow_impl/mod.rs | 14 ++ .../src/{arrow => arrow_impl}/schema.rs | 0 .../{arrow => arrow_impl}/serialization.rs | 0 .../src/arrow_impl/test_deprecated_api.rs | 1 + .../src/{arrow => arrow_impl}/type_support.rs | 0 serde_arrow/src/lib.rs | 38 +++- .../src/test_end_to_end/test_docs_examples.rs | 47 +++++ 20 files changed, 277 insertions(+), 159 deletions(-) delete mode 100644 serde_arrow/src/arrow/mod.rs delete mode 100644 serde_arrow/src/arrow2/mod.rs delete mode 100644 serde_arrow/src/arrow2/test/implementation_docs.rs delete mode 100644 serde_arrow/src/arrow2/test/mod.rs rename serde_arrow/src/{arrow2 => arrow2_impl}/api.rs (100%) rename serde_arrow/src/{arrow2 => arrow2_impl}/deserialization.rs (100%) create mode 100644 serde_arrow/src/arrow2_impl/mod.rs rename serde_arrow/src/{arrow2 => arrow2_impl}/schema.rs (100%) rename serde_arrow/src/{arrow2 => arrow2_impl}/serialization.rs (100%) create mode 100644 
serde_arrow/src/arrow2_impl/test_deprecated_api.rs rename serde_arrow/src/{arrow2 => arrow2_impl}/type_support.rs (100%) rename serde_arrow/src/{arrow => arrow_impl}/api.rs (100%) rename serde_arrow/src/{arrow => arrow_impl}/deserialization.rs (100%) create mode 100644 serde_arrow/src/arrow_impl/mod.rs rename serde_arrow/src/{arrow => arrow_impl}/schema.rs (100%) rename serde_arrow/src/{arrow => arrow_impl}/serialization.rs (100%) create mode 100644 serde_arrow/src/arrow_impl/test_deprecated_api.rs rename serde_arrow/src/{arrow => arrow_impl}/type_support.rs (100%) create mode 100644 serde_arrow/src/test_end_to_end/test_docs_examples.rs diff --git a/serde_arrow/src/arrow/mod.rs b/serde_arrow/src/arrow/mod.rs deleted file mode 100644 index c253ba81..00000000 --- a/serde_arrow/src/arrow/mod.rs +++ /dev/null @@ -1,21 +0,0 @@ -//! Support for the `arrow` crate (*requires one the `arrow-*` features*) -//! -//! Functions to convert Rust objects into arrow Arrays. Deserialization from -//! `arrow` arrays to Rust objects is not yet supported. -//! -#![deny(missing_docs)] -pub(crate) mod api; -mod deserialization; -mod schema; -pub(crate) mod serialization; -mod type_support; - -#[allow(deprecated)] -pub use api::{ - deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, - serialize_into_field, serialize_into_fields, ArrayBuilder, -}; - -/// Build arrays record by record -#[deprecated = "serde_arrow::arrow::ArraysBuilder is deprecated. Use serde_arrow::ArrowBuilder instead."] -pub type ArraysBuilder = api::ArrowBuilder; diff --git a/serde_arrow/src/arrow2/mod.rs b/serde_arrow/src/arrow2/mod.rs deleted file mode 100644 index 0422e5c3..00000000 --- a/serde_arrow/src/arrow2/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Support for the `arrow2` crate (*requires one the `arrow2-*` features*) -//! -//! Functions to convert Rust objects into Arrow arrays and back. -//! 
-#![deny(missing_docs)] -pub(crate) mod api; -pub(crate) mod deserialization; -pub(crate) mod schema; -pub(crate) mod serialization; -mod type_support; - -#[cfg(test)] -mod test; - -#[allow(deprecated)] -pub use api::{ - deserialize_from_array, deserialize_from_arrays, serialize_into_array, serialize_into_arrays, - serialize_into_field, serialize_into_fields, ArrayBuilder, Arrow2Builder, -}; - -/// Build arrays record by record -#[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] -pub type ArraysBuilder = Arrow2Builder; diff --git a/serde_arrow/src/arrow2/test/implementation_docs.rs b/serde_arrow/src/arrow2/test/implementation_docs.rs deleted file mode 100644 index a0952d2c..00000000 --- a/serde_arrow/src/arrow2/test/implementation_docs.rs +++ /dev/null @@ -1,108 +0,0 @@ -use crate::_impl::arrow2::datatypes::{DataType, Field}; -use serde::{Deserialize, Serialize}; - -use crate::{ - arrow2::{deserialize_from_arrays, serialize_into_arrays, serialize_into_fields}, - internal::{event::Event, sink::serialize_into_sink, source::deserialize_from_source}, -}; - -#[test] -fn implementation_docs() { - #[derive(Debug, PartialEq, Serialize, Deserialize)] - struct Record { - a: i32, - b: u32, - } - - let items = vec![ - Record { a: 1, b: 2 }, - Record { a: 3, b: 4 }, - // ... 
- ]; - - let mut events: Vec> = Vec::new(); - serialize_into_sink(&mut events, &items).unwrap(); - - assert_eq!( - events, - vec![ - Event::StartSequence, - Event::Item, - Event::StartStruct, - Event::Str("a"), - Event::I32(1), - Event::Str("b"), - Event::U32(2), - Event::EndStruct, - Event::Item, - Event::StartStruct, - Event::Str("a"), - Event::I32(3), - Event::Str("b"), - Event::U32(4), - Event::EndStruct, - Event::EndSequence - ], - ); - - let items_from_events: Vec = deserialize_from_source(&events).unwrap(); - assert_eq!(items_from_events, items); - - let fields = vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::UInt32, false), - ]; - - let arrays = serialize_into_arrays(&fields, &items).unwrap(); - let items_from_arrays: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); - - assert_eq!(items_from_arrays, items); - - let fields_from_items = serialize_into_fields(&items, Default::default()).unwrap(); - assert_eq!(fields_from_items, fields); -} - -#[test] -fn example_readme() -> Result<(), PanicOnError> { - #[derive(Serialize)] - struct Item { - a: f32, - b: i32, - point: Point, - } - - #[derive(Serialize)] - struct Point(f32, f32); - - let items = vec![ - Item { - a: 1.0, - b: 1, - point: Point(0.0, 1.0), - }, - Item { - a: 2.0, - b: 2, - point: Point(2.0, 3.0), - }, - // ... 
- ]; - - // detect the field types and convert the items to arrays - use crate::arrow2::{serialize_into_arrays, serialize_into_fields}; - - let fields = serialize_into_fields(&items, Default::default())?; - let arrays = serialize_into_arrays(&fields, &items)?; - - drop((fields, arrays)); - Ok(()) -} - -#[derive(Debug)] -struct PanicOnError; - -impl From for PanicOnError { - fn from(e: E) -> Self { - panic!("Encountered error: {e}"); - } -} diff --git a/serde_arrow/src/arrow2/test/mod.rs b/serde_arrow/src/arrow2/test/mod.rs deleted file mode 100644 index c3bd10c4..00000000 --- a/serde_arrow/src/arrow2/test/mod.rs +++ /dev/null @@ -1 +0,0 @@ -mod implementation_docs; diff --git a/serde_arrow/src/arrow2/api.rs b/serde_arrow/src/arrow2_impl/api.rs similarity index 100% rename from serde_arrow/src/arrow2/api.rs rename to serde_arrow/src/arrow2_impl/api.rs diff --git a/serde_arrow/src/arrow2/deserialization.rs b/serde_arrow/src/arrow2_impl/deserialization.rs similarity index 100% rename from serde_arrow/src/arrow2/deserialization.rs rename to serde_arrow/src/arrow2_impl/deserialization.rs diff --git a/serde_arrow/src/arrow2_impl/mod.rs b/serde_arrow/src/arrow2_impl/mod.rs new file mode 100644 index 00000000..9b470483 --- /dev/null +++ b/serde_arrow/src/arrow2_impl/mod.rs @@ -0,0 +1,13 @@ +//! Support for the `arrow2` crate (*requires one the `arrow2-*` features*) +//! +//! Functions to convert Rust objects into Arrow arrays and back. +//! 
+#![deny(missing_docs)] +pub(crate) mod api; +pub(crate) mod deserialization; +pub(crate) mod schema; +pub(crate) mod serialization; +mod type_support; + +#[cfg(test)] +mod test_deprecated_api; diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2_impl/schema.rs similarity index 100% rename from serde_arrow/src/arrow2/schema.rs rename to serde_arrow/src/arrow2_impl/schema.rs diff --git a/serde_arrow/src/arrow2/serialization.rs b/serde_arrow/src/arrow2_impl/serialization.rs similarity index 100% rename from serde_arrow/src/arrow2/serialization.rs rename to serde_arrow/src/arrow2_impl/serialization.rs diff --git a/serde_arrow/src/arrow2_impl/test_deprecated_api.rs b/serde_arrow/src/arrow2_impl/test_deprecated_api.rs new file mode 100644 index 00000000..23d85b53 --- /dev/null +++ b/serde_arrow/src/arrow2_impl/test_deprecated_api.rs @@ -0,0 +1,170 @@ +#![allow(deprecated)] + +use crate as serde_arrow; +use crate::_impl::arrow2; + +#[test] +fn api_docs_serialize_into_fields() { + use arrow2::datatypes::{DataType, Field}; + use serde::Serialize; + use serde_arrow::arrow2::serialize_into_fields; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + + let items = vec![ + Record { a: Some(1.0), b: 2 }, + // ... 
+ ]; + + let fields = serialize_into_fields(&items, Default::default()).unwrap(); + let expected = vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::UInt64, false), + ]; + + assert_eq!(fields, expected); +} + +#[test] +fn api_docs_serialize_into_field() { + use arrow2::datatypes::{DataType, Field}; + use serde_arrow::arrow2::serialize_into_field; + + let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; + + let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); + assert_eq!(field, Field::new("floats", DataType::Float32, false)); +} + +#[test] +fn api_docs_serialize_into_arrays() { + use serde::Serialize; + use serde_arrow::arrow2::{serialize_into_arrays, serialize_into_fields}; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + + let items = vec![ + Record { a: Some(1.0), b: 2 }, + // ... + ]; + + let fields = serialize_into_fields(&items, Default::default()).unwrap(); + let arrays = serialize_into_arrays(&fields, &items).unwrap(); + + assert_eq!(arrays.len(), 2); +} + +#[test] +fn api_docs_serialize_into_array() { + use arrow2::datatypes::{DataType, Field}; + use serde_arrow::arrow2::serialize_into_array; + + let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; + + let field = Field::new("floats", DataType::Float32, false); + let array = serialize_into_array(&field, &items).unwrap(); + + assert_eq!(array.len(), 4); +} + +#[test] +fn api_docs_deserialize_from_arrays() { + use serde::{Deserialize, Serialize}; + use serde_arrow::{ + arrow2::{deserialize_from_arrays, serialize_into_arrays, serialize_into_fields}, + schema::TracingOptions, + }; + + #[derive(Deserialize, Serialize)] + struct Record { + a: Option, + b: u64, + } + + // provide an example record to get the field information + let fields = + serialize_into_fields(&[Record { a: Some(1.0), b: 2 }], TracingOptions::default()).unwrap(); + + // hidden in docs: + let items = &[Record { a: Some(1.0), b: 2 }]; + let arrays = serialize_into_arrays(&fields, 
&items).unwrap(); + + // deserialize the records from arrays + let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); + drop(items); +} + +#[test] +fn api_docs_deserialize_from_array() { + use arrow2::datatypes::{DataType, Field}; + use serde_arrow::arrow2::{deserialize_from_array, serialize_into_array}; + + let field = Field::new("floats", DataType::Float32, false); + + let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); + let items: Vec = deserialize_from_array(&field, &array).unwrap(); +} + +#[test] +fn api_docs_arrays_builder() { + use arrow2::datatypes::{DataType, Field}; + use serde::Serialize; + use serde_arrow::arrow2::ArraysBuilder; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + let fields = vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::UInt64, false), + ]; + let mut builder = ArraysBuilder::new(&fields).unwrap(); + + builder.push(&Record { a: Some(1.0), b: 2 }).unwrap(); + builder.push(&Record { a: Some(3.0), b: 4 }).unwrap(); + builder.push(&Record { a: Some(5.0), b: 5 }).unwrap(); + + builder + .extend(&[ + Record { a: Some(6.0), b: 7 }, + Record { a: Some(8.0), b: 9 }, + Record { + a: Some(10.0), + b: 11, + }, + ]) + .unwrap(); + + let arrays = builder.build_arrays().unwrap(); + + assert_eq!(arrays.len(), 2); + assert_eq!(arrays[0].len(), 6); +} + +#[test] +fn api_docs_array_builder() { + use arrow2::datatypes::{DataType, Field}; + use serde_arrow::arrow2::ArrayBuilder; + + let field = Field::new("value", DataType::Int64, false); + let mut builder = ArrayBuilder::new(&field).unwrap(); + + builder.push(&-1_i64).unwrap(); + builder.push(&2_i64).unwrap(); + builder.push(&-3_i64).unwrap(); + + builder.extend(&[4_i64, -5, 6]).unwrap(); + + let array = builder.build_array().unwrap(); + assert_eq!(array.len(), 6); +} diff --git a/serde_arrow/src/arrow2/type_support.rs b/serde_arrow/src/arrow2_impl/type_support.rs similarity index 100% rename from 
serde_arrow/src/arrow2/type_support.rs rename to serde_arrow/src/arrow2_impl/type_support.rs diff --git a/serde_arrow/src/arrow/api.rs b/serde_arrow/src/arrow_impl/api.rs similarity index 100% rename from serde_arrow/src/arrow/api.rs rename to serde_arrow/src/arrow_impl/api.rs diff --git a/serde_arrow/src/arrow/deserialization.rs b/serde_arrow/src/arrow_impl/deserialization.rs similarity index 100% rename from serde_arrow/src/arrow/deserialization.rs rename to serde_arrow/src/arrow_impl/deserialization.rs diff --git a/serde_arrow/src/arrow_impl/mod.rs b/serde_arrow/src/arrow_impl/mod.rs new file mode 100644 index 00000000..704798dd --- /dev/null +++ b/serde_arrow/src/arrow_impl/mod.rs @@ -0,0 +1,14 @@ +//! Support for the `arrow` crate (*requires one the `arrow-*` features*) +//! +//! Functions to convert Rust objects into arrow Arrays. Deserialization from +//! `arrow` arrays to Rust objects is not yet supported. +//! +#![deny(missing_docs)] +pub(crate) mod api; +mod deserialization; +mod schema; +pub(crate) mod serialization; +mod type_support; + +#[cfg(test)] +mod test_deprecated_api; diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow_impl/schema.rs similarity index 100% rename from serde_arrow/src/arrow/schema.rs rename to serde_arrow/src/arrow_impl/schema.rs diff --git a/serde_arrow/src/arrow/serialization.rs b/serde_arrow/src/arrow_impl/serialization.rs similarity index 100% rename from serde_arrow/src/arrow/serialization.rs rename to serde_arrow/src/arrow_impl/serialization.rs diff --git a/serde_arrow/src/arrow_impl/test_deprecated_api.rs b/serde_arrow/src/arrow_impl/test_deprecated_api.rs new file mode 100644 index 00000000..d3f5a12f --- /dev/null +++ b/serde_arrow/src/arrow_impl/test_deprecated_api.rs @@ -0,0 +1 @@ + diff --git a/serde_arrow/src/arrow/type_support.rs b/serde_arrow/src/arrow_impl/type_support.rs similarity index 100% rename from serde_arrow/src/arrow/type_support.rs rename to serde_arrow/src/arrow_impl/type_support.rs diff 
--git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 65be1a4c..aa171c09 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -223,18 +223,44 @@ mod test; pub use crate::internal::error::{Error, Result}; #[cfg(has_arrow)] -pub use arrow::api::{from_arrow, to_arrow, ArrowBuilder}; +mod arrow_impl; #[cfg(has_arrow)] -// #[deprecated = "The items in serde_arrow::arrow are deprecated. See the individual items for suitable replacements"] -pub mod arrow; +pub use arrow_impl::api::{from_arrow, to_arrow, ArrowBuilder}; + +#[cfg(has_arrow)] +#[deprecated = "The items in serde_arrow::arrow are deprecated. See the individual items for suitable replacements"] +pub mod arrow { + #[allow(deprecated)] + pub use crate::arrow_impl::api::{ + deserialize_from_array, deserialize_from_arrays, serialize_into_array, + serialize_into_arrays, serialize_into_field, serialize_into_fields, ArrayBuilder, + }; + + /// Build arrays record by record + #[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] + pub type ArraysBuilder = crate::arrow_impl::api::ArrowBuilder; +} #[cfg(has_arrow2)] -pub use arrow2::api::{from_arrow2, to_arrow2, Arrow2Builder}; +mod arrow2_impl; #[cfg(has_arrow2)] -// #[deprecated = "The items in serde_arrow::arrow2 are deprecated. See the individual items for suitable replacements"] -pub mod arrow2; +pub use arrow2_impl::api::{from_arrow2, to_arrow2, Arrow2Builder}; + +#[cfg(has_arrow2)] +#[deprecated = "The items in serde_arrow::arrow2 are deprecated. See the individual items for suitable replacements"] +pub mod arrow2 { + #[allow(deprecated)] + pub use crate::arrow2_impl::api::{ + deserialize_from_array, deserialize_from_arrays, serialize_into_array, + serialize_into_arrays, serialize_into_field, serialize_into_fields, ArrayBuilder, + }; + + /// Build arrays record by record + #[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. 
Use serde_arrow::Arrow2Builder instead"] + pub type ArraysBuilder = crate::arrow2_impl::api::Arrow2Builder; +} #[deny(missing_docs)] pub mod schema; diff --git a/serde_arrow/src/test_end_to_end/test_docs_examples.rs b/serde_arrow/src/test_end_to_end/test_docs_examples.rs new file mode 100644 index 00000000..8ae96d59 --- /dev/null +++ b/serde_arrow/src/test_end_to_end/test_docs_examples.rs @@ -0,0 +1,47 @@ +use crate::_impl::arrow2::datatypes::{DataType, Field}; +use serde::{Deserialize, Serialize}; + +use crate::internal::{event::Event, sink::serialize_into_sink, source::deserialize_from_source}; + +#[test] +fn implementation_docs() { + #[derive(Debug, PartialEq, Serialize, Deserialize)] + struct Record { + a: i32, + b: u32, + } + + let items = vec![ + Record { a: 1, b: 2 }, + Record { a: 3, b: 4 }, + // ... + ]; + + let mut events: Vec> = Vec::new(); + serialize_into_sink(&mut events, &items).unwrap(); + + assert_eq!( + events, + vec![ + Event::StartSequence, + Event::Item, + Event::StartStruct, + Event::Str("a"), + Event::I32(1), + Event::Str("b"), + Event::U32(2), + Event::EndStruct, + Event::Item, + Event::StartStruct, + Event::Str("a"), + Event::I32(3), + Event::Str("b"), + Event::U32(4), + Event::EndStruct, + Event::EndSequence + ], + ); + + let items_from_events: Vec = deserialize_from_source(&events).unwrap(); + assert_eq!(items_from_events, items); +} \ No newline at end of file From de13d45672c6431ba9bac53517da51089ae9e163 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 12:14:58 +0100 Subject: [PATCH 18/27] Update API docs --- serde_arrow/src/arrow2_impl/api.rs | 390 +++++------------- serde_arrow/src/arrow2_impl/schema.rs | 13 +- .../src/arrow2_impl/test_deprecated_api.rs | 5 +- serde_arrow/src/arrow_impl/api.rs | 305 +++++++------- serde_arrow/src/arrow_impl/schema.rs | 13 +- .../src/arrow_impl/test_deprecated_api.rs | 173 +++++++- serde_arrow/src/lib.rs | 55 ++- serde_arrow/src/schema.rs | 4 +- 8 files changed, 473 
insertions(+), 485 deletions(-) diff --git a/serde_arrow/src/arrow2_impl/api.rs b/serde_arrow/src/arrow2_impl/api.rs index 0782d9f8..e9863d5e 100644 --- a/serde_arrow/src/arrow2_impl/api.rs +++ b/serde_arrow/src/arrow2_impl/api.rs @@ -17,25 +17,18 @@ use crate::{ }, }; -/// Determine the schema (as a list of fields) for the given items -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). -/// -/// To correctly record the type information make sure to: -/// -/// - include values for `Option` -/// - include all variants of an enum -/// - include at least single element of a list or a map + +/// Build arrow2 arrays record by record (*requires one of the `arrow2-*` +/// features*) /// /// Example: /// /// ```rust +/// # fn main() -> serde_arrow::Result<()> { /// # use serde_arrow::_impl::arrow2 as arrow2; -/// # /// use arrow2::datatypes::{DataType, Field}; /// use serde::Serialize; -/// use serde_arrow::arrow2::serialize_into_fields; +/// use serde_arrow::Arrow2Builder; /// /// ##[derive(Serialize)] /// struct Record { @@ -43,48 +36,72 @@ use crate::{ /// b: u64, /// } /// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... 
-/// ]; -/// -/// let fields = serialize_into_fields(&items, Default::default()).unwrap(); -/// let expected = vec![ +/// let mut builder = Arrow2Builder::new(&[ /// Field::new("a", DataType::Float32, true), /// Field::new("b", DataType::UInt64, false), -/// ]; +/// ])?; /// -/// assert_eq!(fields, expected); -/// ``` +/// builder.push(&Record { a: Some(1.0), b: 2})?; +/// builder.push(&Record { a: Some(3.0), b: 4})?; +/// builder.push(&Record { a: Some(5.0), b: 5})?; /// -pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> -where - T: Serialize + ?Sized, -{ - let mut tracer = Tracer::new(String::from("$"), options); - tracer.trace_samples(items)?; +/// builder.extend(&[ +/// Record { a: Some(6.0), b: 7}, +/// Record { a: Some(8.0), b: 9}, +/// Record { a: Some(10.0), b: 11}, +/// ])?; +/// +/// let arrays = builder.build_arrays()?; +/// # +/// # assert_eq!(arrays.len(), 2); +/// # assert_eq!(arrays[0].len(), 6); +/// # Ok(()) +/// # } +/// ``` +pub struct Arrow2Builder(generic::GenericBuilder); - let schema = tracer.to_schema()?; - schema.to_arrow2_fields() +impl std::fmt::Debug for Arrow2Builder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Arrow2Builder<...>") + } } -/// Build arrays from the given items -pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result>> -where - T: Serialize + ?Sized, -{ - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; +impl Arrow2Builder { + /// Build a new Arrow2Builder for the given fields + /// + /// This method may fail when unsupported data types are encountered in the + /// given fields. 
+ /// + pub fn new(fields: &[Field]) -> Result { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) + } - let program = compile_serialization(&fields, CompilationOptions::default())?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut interpreter, items)?; + /// Add a single record to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } - interpreter.build_arrow2_arrays() + /// Add multiple records to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the arrays from the rows pushed to far. + /// + /// This operation will reset the underlying buffers and start a new batch. + /// + pub fn build_arrays(&mut self) -> Result>> { + self.0 .0.build_arrow2_arrays() + } } + /// Build arrow2 arrays from the given items (*requires one of the `arrow2-*` /// features*) /// @@ -109,10 +126,12 @@ where /// // ... /// ]; /// -/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap().to_arrow2_fields().unwrap(); -/// let arrays = serde_arrow::to_arrow2(&fields, &items).unwrap(); +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())?. +/// to_arrow2_fields()?; /// -/// assert_eq!(arrays.len(), 2); +/// let arrays = serde_arrow::to_arrow2(&fields, &items)?; +/// # +/// # assert_eq!(arrays.len(), 2); /// # Ok(()) /// # } /// ``` @@ -133,20 +152,16 @@ where interpreter.build_arrow2_arrays() } -/// Deserialize a type from the given arrays + +/// Deserialize items from the given arrow2 arrays (*requires* one of the +/// `arrow2-*` features) /// /// The type should be a list of records (e.g., a vector of structs). 
/// /// ```rust +/// # fn main() -> serde_arrow::Result<()> { /// use serde::{Deserialize, Serialize}; -/// use serde_arrow::{ -/// arrow2::{ -/// deserialize_from_arrays, -/// serialize_into_arrays, -/// serialize_into_fields, -/// }, -/// schema::TracingOptions, -/// }; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; /// /// ##[derive(Deserialize, Serialize)] /// struct Record { @@ -154,20 +169,19 @@ where /// b: u64, /// } /// -/// // provide an example record to get the field information -/// let fields = serialize_into_fields( -/// &[Record { a: Some(1.0), b: 2}], -/// TracingOptions::default(), -/// ).unwrap(); +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())? +/// .to_arrow2_fields()?; /// # let items = &[Record { a: Some(1.0), b: 2}]; -/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// # let arrays = serde_arrow::to_arrow2(&fields, &items).unwrap(); /// # /// /// // deserialize the records from arrays -/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); +/// let items: Vec = serde_arrow::from_arrow2(&fields, &arrays)?; +/// # Ok(()) +/// # } /// ``` /// -pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +pub fn from_arrow2<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result where T: Deserialize<'de>, A: AsRef, @@ -203,92 +217,41 @@ where deserialize_from_source(interpreter) } -/// Deserialize items from the given arrow2 arrays (*requires* one of the -/// `arrow2-*` features) -/// -/// The type should be a list of records (e.g., a vector of structs). 
-/// -/// ```rust -/// use serde::{Deserialize, Serialize}; -/// use serde_arrow::{ -/// arrow2::{ -/// deserialize_from_arrays, -/// serialize_into_arrays, -/// serialize_into_fields, -/// }, -/// schema::TracingOptions, -/// }; -/// -/// ##[derive(Deserialize, Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// // provide an example record to get the field information -/// let fields = serialize_into_fields( -/// &[Record { a: Some(1.0), b: 2}], -/// TracingOptions::default(), -/// ).unwrap(); -/// # let items = &[Record { a: Some(1.0), b: 2}]; -/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); -/// # -/// -/// // deserialize the records from arrays -/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); -/// ``` -/// -pub fn from_arrow2<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +/// Determine the schema (as a list of fields) for the given items +#[deprecated = "serde_arrow::arrow2::serialize_into_fields is deprecated. Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] +pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> where - T: Deserialize<'de>, - A: AsRef, + T: Serialize + ?Sized, { - use crate::internal::{ - common::{BufferExtract, Buffers}, - deserialization, - }; - - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; + let mut tracer = Tracer::new(String::from("$"), options); + tracer.trace_samples(items)?; - let num_items = arrays - .iter() - .map(|a| a.as_ref().len()) - .min() - .unwrap_or_default(); + let schema = tracer.to_schema()?; + schema.to_arrow2_fields() +} - let mut buffers = Buffers::new(); - let mut mappings = Vec::new(); - for (field, array) in fields.iter().zip(arrays.iter()) { - mappings.push(array.as_ref().extract_buffers(field, &mut buffers)?); - } +/// Renamed to [`serde_arrow::to_arrow2`][crate::to_arrow2] +#[deprecated = "serde_arrow::arrow2::serialize_into_arrays is deprecated. 
Use serde_arrow::to_arrow2 instead"] +pub fn serialize_into_arrays(fields: &[Field], items: &T) -> Result>> +where + T: Serialize + ?Sized, +{ + crate::to_arrow2(fields, items) +} - let interpreter = deserialization::compile_deserialization( - num_items, - &mappings, - buffers, - deserialization::CompilationOptions::default(), - )?; - deserialize_from_source(interpreter) +/// Renamed to [`serde_arrow::from_arrow2`][crate::from_arrow2] +#[deprecated = "serde_arrow::arrow2::deserialize_from_arrays is deprecated. Use serde_arrow::from_arrow2 instead"] +pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result +where + T: Deserialize<'de>, + A: AsRef, +{ + crate::from_arrow2(fields, arrays) } + /// Determine the schema of an object that represents a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{DataType, Field}; -/// use serde_arrow::arrow2::serialize_into_field; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); -/// assert_eq!(field, Field::new("floats", DataType::Float32, false)); -/// ``` -/// +#[deprecated = "serde_arrow::arrow2::serialize_into_field is deprecated. 
Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result where T: Serialize + ?Sized, @@ -300,23 +263,7 @@ where } /// Serialize a sequence of objects representing a single array into an array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// # -/// use arrow2::datatypes::{DataType, Field}; -/// use serde_arrow::arrow2::serialize_into_array; -/// -/// let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// let array = serialize_into_array(&field, &items).unwrap(); -/// -/// assert_eq!(array.len(), 4); -/// ``` -/// +#[deprecated = "serde_arrow::arrow2::serialize_into_array is deprecated. Use serde_arrow::to_arrow2 instead"] pub fn serialize_into_array(field: &Field, items: &T) -> Result> where T: Serialize + ?Sized, @@ -333,24 +280,7 @@ where } /// Deserialize a sequence of objects from a single array -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// # -/// use arrow2::{array::Array, datatypes::{DataType, Field}}; -/// use serde_arrow::arrow2::{ -/// serialize_into_array, -/// deserialize_from_array, -/// }; -/// -/// let field = Field::new("floats", DataType::Float32, false); -/// -/// let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); -/// let items: Vec = deserialize_from_array(&field, &array).unwrap(); -/// ``` -/// +#[deprecated = "serde_arrow::arrow2::deserialize_from_array is deprecated. 
Use serde_arrow::from_arrow2 instead"] pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result where T: Deserialize<'de>, @@ -360,35 +290,12 @@ where } /// Build a single array item by item -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{Field, DataType}; -/// use serde_arrow::arrow2::ArrayBuilder; -/// -/// let field = Field::new("value", DataType::Int64, false); -/// let mut builder = ArrayBuilder::new(&field).unwrap(); -/// -/// builder.push(&-1_i64).unwrap(); -/// builder.push(&2_i64).unwrap(); -/// builder.push(&-3_i64).unwrap(); -/// -/// builder.extend(&[4_i64, -5, 6]).unwrap(); -/// -/// let array = builder.build_array().unwrap(); -/// assert_eq!(array.len(), 6); -/// ``` #[deprecated = "serde_arrow::arrow2::ArrayBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] pub struct ArrayBuilder(generic::GenericBuilder); #[allow(deprecated)] impl ArrayBuilder { /// Construct a new build for the given field - /// - /// This method may fail for an unsupported data type of the given field. - /// pub fn new(field: &Field) -> Result { Ok(Self(generic::GenericBuilder::new_for_array( GenericField::try_from(field)?, @@ -396,102 +303,17 @@ impl ArrayBuilder { } /// Add a single item to the arrays - /// pub fn push(&mut self, item: &T) -> Result<()> { self.0.push(item) } /// Add multiple items to the arrays - /// pub fn extend(&mut self, items: &T) -> Result<()> { self.0.extend(items) } /// Build the array from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. 
- /// pub fn build_array(&mut self) -> Result> { self.0 .0.build_arrow2_array() } } - -/// Build arrow2 arrays record by record -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow2 as arrow2; -/// use arrow2::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::Arrow2Builder; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } - -/// let fields = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// let mut builder = Arrow2Builder::new(&fields).unwrap(); -/// -/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); -/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); -/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); -/// -/// builder.extend(&[ -/// Record { a: Some(6.0), b: 7}, -/// Record { a: Some(8.0), b: 9}, -/// Record { a: Some(10.0), b: 11}, -/// ]).unwrap(); -/// -/// let arrays = builder.build_arrays().unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// assert_eq!(arrays[0].len(), 6); -/// ``` -pub struct Arrow2Builder(generic::GenericBuilder); - -impl std::fmt::Debug for Arrow2Builder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Arrow2Builder<...>") - } -} - -impl Arrow2Builder { - /// Build a new Arrow2Builder for the given fields - /// - /// This method may fail when unsupported data types are encountered in the - /// given fields. - /// - pub fn new(fields: &[Field]) -> Result { - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) - } - - /// Add a single record to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple records to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the arrays from the rows pushed to far. 
- /// - /// This operation will reset the underlying buffers and start a new batch. - /// - pub fn build_arrays(&mut self) -> Result>> { - self.0 .0.build_arrow2_arrays() - } -} diff --git a/serde_arrow/src/arrow2_impl/schema.rs b/serde_arrow/src/arrow2_impl/schema.rs index 7a6f70d6..a357cfce 100644 --- a/serde_arrow/src/arrow2_impl/schema.rs +++ b/serde_arrow/src/arrow2_impl/schema.rs @@ -21,7 +21,18 @@ impl SerdeArrowSchema { }) } - /// Build a vec of fields from a Schema object + /// This method is deprecated. Use + /// [`to_arrow2_fields`][SerdeArrowSchema::to_arrow2_fields] instead: + /// + /// ```rust + /// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// # #[derive(serde::Deserialize)] + /// # struct Item { a: u32 } + /// # let schema = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap(); + /// # let fields = + /// schema.to_arrow2_fields().unwrap() + /// # ; + /// ``` #[deprecated = "The method `get_arrow2_fields` is deprecated. Use `to_arrow2_fields` instead"] pub fn get_arrow2_fields(&self) -> Result> { self.to_arrow2_fields() diff --git a/serde_arrow/src/arrow2_impl/test_deprecated_api.rs b/serde_arrow/src/arrow2_impl/test_deprecated_api.rs index 23d85b53..0d40b3a9 100644 --- a/serde_arrow/src/arrow2_impl/test_deprecated_api.rs +++ b/serde_arrow/src/arrow2_impl/test_deprecated_api.rs @@ -1,7 +1,6 @@ #![allow(deprecated)] -use crate as serde_arrow; -use crate::_impl::arrow2; +use crate::{self as serde_arrow, _impl::arrow2}; #[test] fn api_docs_serialize_into_fields() { @@ -111,6 +110,8 @@ fn api_docs_deserialize_from_array() { let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); let items: Vec = deserialize_from_array(&field, &array).unwrap(); + + drop(items); } #[test] diff --git a/serde_arrow/src/arrow_impl/api.rs b/serde_arrow/src/arrow_impl/api.rs index 382f1833..5db4f067 100644 --- a/serde_arrow/src/arrow_impl/api.rs +++ b/serde_arrow/src/arrow_impl/api.rs @@ -17,21 +17,144 @@ use crate::{ }, 
}; + +/// Build arrow arrays record by record (*requires one of the `arrow-*` +/// features*) +/// +/// Example: +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// # use serde_arrow::_impl::arrow as arrow; +/// use arrow::datatypes::{DataType, Field}; +/// use serde::Serialize; +/// use serde_arrow::ArrowBuilder; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let mut builder = ArrowBuilder::new(&[ +/// Field::new("a", DataType::Float32, true), +/// Field::new("b", DataType::UInt64, false), +/// ])?; +/// +/// builder.push(&Record { a: Some(1.0), b: 2})?; +/// builder.push(&Record { a: Some(3.0), b: 4})?; +/// builder.push(&Record { a: Some(5.0), b: 5})?; +/// +/// builder.extend(&[ +/// Record { a: Some(6.0), b: 7}, +/// Record { a: Some(8.0), b: 9}, +/// Record { a: Some(10.0), b: 11}, +/// ])?; +/// +/// let arrays = builder.build_arrays()?; +/// # +/// # assert_eq!(arrays.len(), 2); +/// # assert_eq!(arrays[0].len(), 6); +/// # Ok(()) +/// # } +/// ``` +pub struct ArrowBuilder(generic::GenericBuilder); + +impl std::fmt::Debug for ArrowBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ArrowBuilder<...>") + } +} + +impl ArrowBuilder { + /// Build a new ArrowBuilder for the given fields + /// + /// This method may fail when unsupported data types are encountered in the + /// given fields. + /// + pub fn new(fields: &[Field]) -> Result { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) + } + + /// Add a single record to the arrays + /// + pub fn push(&mut self, item: &T) -> Result<()> { + self.0.push(item) + } + + /// Add multiple records to the arrays + /// + pub fn extend(&mut self, items: &T) -> Result<()> { + self.0.extend(items) + } + + /// Build the arrays from the rows pushed to far. 
+ /// + /// This operation will reset the underlying buffers and start a new batch. + /// + pub fn build_arrays(&mut self) -> Result> { + self.0 .0.build_arrow_arrays() + } +} + +/// Build arrow arrays from the given items (*requires one of the `arrow-*` +/// features*)) +/// +/// `items` should be given in the form a list of records (e.g., a vector of +/// structs). +/// +/// Example: +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde::{Serialize, Deserialize}; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// +/// ##[derive(Serialize, Deserialize)] +/// struct Record { +/// a: Option, +/// b: u64, +/// } +/// +/// let items = vec![ +/// Record { a: Some(1.0), b: 2}, +/// // ... +/// ]; +/// +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())? +/// .to_arrow_fields()?; +/// let arrays = serde_arrow::to_arrow(&fields, &items)?; +/// # +/// # assert_eq!(arrays.len(), 2); +/// # Ok(()) +/// # } +/// ``` +/// +pub fn to_arrow(fields: &[Field], items: &T) -> Result> { + let fields = fields + .iter() + .map(GenericField::try_from) + .collect::>>()?; + + let program = compile_serialization(&fields, CompilationOptions::default())?; + let mut interpreter = Interpreter::new(program); + serialize_into_sink(&mut interpreter, items)?; + interpreter.build_arrow_arrays() +} + /// Deserialize items from arrow arrays (*requires one of the `arrow-*` /// features*) /// /// The type should be a list of records (e.g., a vector of structs). 
/// /// ```rust +/// # fn main() -> serde_arrow::Result<()> { /// use serde::{Deserialize, Serialize}; -/// use serde_arrow::{ -/// arrow::{ -/// deserialize_from_arrays, -/// serialize_into_arrays, -/// serialize_into_fields, -/// }, -/// schema::TracingOptions, -/// }; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; /// /// ##[derive(Deserialize, Serialize)] /// struct Record { @@ -40,16 +163,16 @@ use crate::{ /// } /// /// // provide an example record to get the field information -/// let fields = serialize_into_fields( -/// &[Record { a: Some(1.0), b: 2}], -/// TracingOptions::default(), -/// ).unwrap(); +/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())? +/// .to_arrow_fields()?; /// # let items = &[Record { a: Some(1.0), b: 2}]; -/// # let arrays = serialize_into_arrays(&fields, &items).unwrap(); +/// # let arrays = serde_arrow::to_arrow(&fields, &items).unwrap(); /// # /// /// // deserialize the records from arrays -/// let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); +/// let items: Vec = serde_arrow::from_arrow(&fields, &arrays).unwrap(); +/// # Ok(()) +/// # } /// ``` /// pub fn from_arrow<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result @@ -114,67 +237,23 @@ where Field::try_from(&field) } -/// Build arrays from the given items +/// Renamed to [`serde_arrow::to_arrow`][crate::to_arrow] #[deprecated = "serialize_into_arrays is deprecated. Use serde_arrow::to_arrow instead"] pub fn serialize_into_arrays( fields: &[Field], items: &T, ) -> Result> { - to_arrow(fields, items) -} - -/// Build arrow arrays from the given items (*requires one of the `arrow-*` -/// features*)) -/// -/// `items` should be given in the form a list of records (e.g., a vector of -/// structs). 
-/// -/// Example: -/// -/// ```rust -/// # fn main() -> serde_arrow::Result<()> { -/// use serde::{Serialize, Deserialize}; -/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; -/// -/// ##[derive(Serialize, Deserialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } -/// -/// let items = vec![ -/// Record { a: Some(1.0), b: 2}, -/// // ... -/// ]; -/// -/// let fields = SerdeArrowSchema::from_type::(TracingOptions::default())?.to_arrow_fields()?; -/// let arrays = serde_arrow::to_arrow(&fields, &items)?; -/// -/// assert_eq!(arrays.len(), 2); -/// # Ok(()) -/// # } -/// ``` -/// -pub fn to_arrow(fields: &[Field], items: &T) -> Result> { - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - - let program = compile_serialization(&fields, CompilationOptions::default())?; - let mut interpreter = Interpreter::new(program); - serialize_into_sink(&mut interpreter, items)?; - interpreter.build_arrow_arrays() + crate::to_arrow(fields, items) } -/// Deserialize a type from the given arrays +/// Renamed to [`serde_arrow::from_arrow`][crate::from_arrow] #[deprecated = "deserialize_from_arrays is deprecated. Use serde_arrow::from_arrow instead"] pub fn deserialize_from_arrays<'de, T, A>(fields: &'de [Field], arrays: &'de [A]) -> Result where T: Deserialize<'de>, A: AsRef, { - from_arrow(fields, arrays) + crate::from_arrow(fields, arrays) } /// Serialize an object that represents a single array into an array @@ -195,7 +274,7 @@ where } /// Deserialize a sequence of objects from a single array -#[deprecated = "deserialize_from_array is deprecated"] +#[deprecated = "deserialize_from_array is deprecated. 
Use serde_arrow::from_arrow instead"] pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result where T: Deserialize<'de>, @@ -205,26 +284,6 @@ where } /// Build a single array item by item -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// use arrow::datatypes::{Field, DataType}; -/// use serde_arrow::arrow::ArrayBuilder; -/// -/// let field = Field::new("value", DataType::Int64, false); -/// let mut builder = ArrayBuilder::new(&field).unwrap(); -/// -/// builder.push(&-1_i64).unwrap(); -/// builder.push(&2_i64).unwrap(); -/// builder.push(&-3_i64).unwrap(); -/// -/// builder.extend(&[4_i64, -5, 6]).unwrap(); -/// -/// let array = builder.build_array().unwrap(); -/// assert_eq!(array.len(), 6); -/// ``` #[deprecated = "serde_arrow::arrow::ArrayBuilder is deprecated. Use serde_arrow::ArrowBuilder instead"] pub struct ArrayBuilder(generic::GenericBuilder); @@ -252,83 +311,3 @@ impl ArrayBuilder { self.0 .0.build_arrow_array() } } - -/// Build arrow arrays record by record -/// -/// Example: -/// -/// ```rust -/// # use serde_arrow::_impl::arrow as arrow; -/// use arrow::datatypes::{DataType, Field}; -/// use serde::Serialize; -/// use serde_arrow::ArrowBuilder; -/// -/// ##[derive(Serialize)] -/// struct Record { -/// a: Option, -/// b: u64, -/// } - -/// let fields = vec![ -/// Field::new("a", DataType::Float32, true), -/// Field::new("b", DataType::UInt64, false), -/// ]; -/// let mut builder = ArrowBuilder::new(&fields).unwrap(); -/// -/// builder.push(&Record { a: Some(1.0), b: 2}).unwrap(); -/// builder.push(&Record { a: Some(3.0), b: 4}).unwrap(); -/// builder.push(&Record { a: Some(5.0), b: 5}).unwrap(); -/// -/// builder.extend(&[ -/// Record { a: Some(6.0), b: 7}, -/// Record { a: Some(8.0), b: 9}, -/// Record { a: Some(10.0), b: 11}, -/// ]).unwrap(); -/// -/// let arrays = builder.build_arrays().unwrap(); -/// -/// assert_eq!(arrays.len(), 2); -/// assert_eq!(arrays[0].len(), 6); -/// ``` 
-pub struct ArrowBuilder(generic::GenericBuilder); - -impl std::fmt::Debug for ArrowBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "ArrowBuilder<...>") - } -} - -impl ArrowBuilder { - /// Build a new ArrowBuilder for the given fields - /// - /// This method may fail when unsupported data types are encountered in the - /// given fields. - /// - pub fn new(fields: &[Field]) -> Result { - let fields = fields - .iter() - .map(GenericField::try_from) - .collect::>>()?; - Ok(Self(generic::GenericBuilder::new_for_arrays(&fields)?)) - } - - /// Add a single record to the arrays - /// - pub fn push(&mut self, item: &T) -> Result<()> { - self.0.push(item) - } - - /// Add multiple records to the arrays - /// - pub fn extend(&mut self, items: &T) -> Result<()> { - self.0.extend(items) - } - - /// Build the arrays from the rows pushed to far. - /// - /// This operation will reset the underlying buffers and start a new batch. - /// - pub fn build_arrays(&mut self) -> Result> { - self.0 .0.build_arrow_arrays() - } -} diff --git a/serde_arrow/src/arrow_impl/schema.rs b/serde_arrow/src/arrow_impl/schema.rs index 46f179ac..ff12e8d4 100644 --- a/serde_arrow/src/arrow_impl/schema.rs +++ b/serde_arrow/src/arrow_impl/schema.rs @@ -22,7 +22,18 @@ impl SerdeArrowSchema { }) } - /// Build a vec of fields from a Schema object + /// This method is deprecated. Use + /// [`to_arrow_fields`][SerdeArrowSchema::to_arrow_fields] instead: + /// + /// ```rust + /// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// # #[derive(serde::Deserialize)] + /// # struct Item { a: u32 } + /// # let schema = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap(); + /// # let fields = + /// schema.to_arrow_fields().unwrap() + /// # ; + /// ``` #[deprecated = "The method `get_arrow_fields` is deprecated. 
Use `to_arrow_fields` instead"] pub fn get_arrow_fields(&self) -> Result> { self.to_arrow_fields() diff --git a/serde_arrow/src/arrow_impl/test_deprecated_api.rs b/serde_arrow/src/arrow_impl/test_deprecated_api.rs index d3f5a12f..f95186f1 100644 --- a/serde_arrow/src/arrow_impl/test_deprecated_api.rs +++ b/serde_arrow/src/arrow_impl/test_deprecated_api.rs @@ -1 +1,172 @@ - +#![allow(deprecated)] + +use crate::{self as serde_arrow, _impl::arrow}; + +#[test] +fn test_serialize_into_fields() { + use arrow::datatypes::{DataType, Field}; + use serde::Serialize; + use serde_arrow::arrow::serialize_into_fields; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + + let items = vec![ + Record { a: Some(1.0), b: 2 }, + // ... + ]; + + let fields = serialize_into_fields(&items, Default::default()).unwrap(); + let expected = vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::UInt64, false), + ]; + + assert_eq!(fields, expected); +} + +#[test] +fn test_serialize_into_field() { + use arrow::datatypes::{DataType, Field}; + use serde_arrow::arrow::serialize_into_field; + + let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; + + let field = serialize_into_field(&items, "floats", Default::default()).unwrap(); + assert_eq!(field, Field::new("floats", DataType::Float32, false)); +} + +#[test] +fn test_serialize_into_arrays() { + use serde::Serialize; + use serde_arrow::arrow::{serialize_into_arrays, serialize_into_fields}; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + + let items = vec![ + Record { a: Some(1.0), b: 2 }, + // ... 
+ ]; + + let fields = serialize_into_fields(&items, Default::default()).unwrap(); + let arrays = serialize_into_arrays(&fields, &items).unwrap(); + + assert_eq!(arrays.len(), 2); +} + +#[test] +fn test_serialize_into_array() { + use arrow::datatypes::{DataType, Field}; + use serde_arrow::arrow::serialize_into_array; + + let items: Vec = vec![1.0, 2.0, 3.0, 4.0]; + + let field = Field::new("floats", DataType::Float32, false); + let array = serialize_into_array(&field, &items).unwrap(); + + assert_eq!(array.len(), 4); +} + +#[test] +fn test_deserialize_from_arrays() { + use serde::{Deserialize, Serialize}; + use serde_arrow::{ + arrow::{deserialize_from_arrays, serialize_into_arrays, serialize_into_fields}, + schema::TracingOptions, + }; + + #[derive(Deserialize, Serialize)] + struct Record { + a: Option, + b: u64, + } + + // provide an example record to get the field information + let fields = + serialize_into_fields(&[Record { a: Some(1.0), b: 2 }], TracingOptions::default()).unwrap(); + + // hidden in docs + let items = &[Record { a: Some(1.0), b: 2 }]; + let arrays = serialize_into_arrays(&fields, &items).unwrap(); + + // deserialize the records from arrays + let items: Vec = deserialize_from_arrays(&fields, &arrays).unwrap(); + + drop(items); +} + +#[test] +fn test_deserialize_from_array() { + use arrow::datatypes::{DataType, Field}; + use serde_arrow::arrow::{deserialize_from_array, serialize_into_array}; + + let field = Field::new("floats", DataType::Float32, false); + + let array = serialize_into_array(&field, &vec![1.0_f32, 2.0, 3.0]).unwrap(); + let items: Vec = deserialize_from_array(&field, &array).unwrap(); + + drop(items); +} + +#[test] +fn test_arrays_builder() { + use arrow::datatypes::{DataType, Field}; + use serde::Serialize; + use serde_arrow::arrow::ArraysBuilder; + + #[derive(Serialize)] + struct Record { + a: Option, + b: u64, + } + let fields = vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::UInt64, false), + ]; + 
let mut builder = ArraysBuilder::new(&fields).unwrap(); + + builder.push(&Record { a: Some(1.0), b: 2 }).unwrap(); + builder.push(&Record { a: Some(3.0), b: 4 }).unwrap(); + builder.push(&Record { a: Some(5.0), b: 5 }).unwrap(); + + builder + .extend(&[ + Record { a: Some(6.0), b: 7 }, + Record { a: Some(8.0), b: 9 }, + Record { + a: Some(10.0), + b: 11, + }, + ]) + .unwrap(); + + let arrays = builder.build_arrays().unwrap(); + + assert_eq!(arrays.len(), 2); + assert_eq!(arrays[0].len(), 6); +} + +#[test] +fn test_array_builder() { + use arrow::datatypes::{DataType, Field}; + use serde_arrow::arrow::ArrayBuilder; + + let field = Field::new("value", DataType::Int64, false); + let mut builder = ArrayBuilder::new(&field).unwrap(); + + builder.push(&-1_i64).unwrap(); + builder.push(&2_i64).unwrap(); + builder.push(&-3_i64).unwrap(); + + builder.extend(&[4_i64, -5, 6]).unwrap(); + + let array = builder.build_array().unwrap(); + assert_eq!(array.len(), 6); +} diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index aa171c09..f8be1375 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -16,40 +16,31 @@ //! E.g., to convert Rust strings containing timestamps to Date64 arrays, the //! schema should contain a `Date64`. `serde_arrow` supports to derive the //! schema from the data itself via schema tracing, but does not require it. It -//! is always possible to specify the schema manually. +//! is always possible to specify the schema manually. See the [`schema`] +//! module for further details. //! //! ## Overview //! -//! The functions come in pairs: some work on single arrays, i.e., the series -//! of a data frame, some work on multiples arrays, i.e., data frames -//! themselves. -//! -//! | implementation | operation | multiple arrays | single array | -//! |---|---|---|---| -//! | **arrow** | schema tracing | [arrow::serialize_into_fields] | [arrow::serialize_into_field] | -//! 
| | Rust to Arrow | [arrow::serialize_into_arrays] | [arrow::serialize_into_array] | -//! | | Arrow to Rust | [arrow::deserialize_from_arrays] | [arrow::deserialize_from_array] | -//! | | Builder | [arrow::ArraysBuilder] | [arrow::ArrayBuilder] | -//! | | | | | -//! | **arrow2** | schema tracing | [arrow2::serialize_into_fields] | [arrow2::serialize_into_field] | -//! | | Rust to Arrow | [arrow2::serialize_into_arrays] | [arrow2::serialize_into_array] | -//! | | Arrow to Rust | [arrow2::deserialize_from_arrays] | [arrow2::deserialize_from_array] | -//! | | Builder | [arrow2::ArraysBuilder] | [arrow2::ArrayBuilder] | +//! | Operation | `arrow` | `arrow2` | +//! |------------------|------------------|-------------------| +//! | Rust to Arrow | [`to_arrow`] | [`to_arrow2`] | +//! | Arrow to Rust | [`from_arrow`] | [`from_arrow2`] | +//! | Arrow Builder | [`ArrowBuilder`] | [`Arrow2Builder`] | +//! | | | | +//! | Fields to SerdeArrowSchema | [`SerdeArrowSchema::from_arrow_fields`][schema::SerdeArrowSchema::from_arrow_fields] | [`SerdeArrowSchema::form_arrow2_fields`][schema::SerdeArrowSchema::from_arrow2_fields] | +//! | SerdeArrowSchema to fields | [`schema.to_arrow_fields()`][schema::SerdeArrowSchema::to_arrow_fields] | [`schema.to_arrow2_fields()`][schema::SerdeArrowSchema::to_arrow2_fields] | //! //! ## Example //! //! Requires one of `arrow2` feature (see below). //! //! ```rust -//! # use serde::Serialize; -//! # #[cfg(feature = "arrow2-0-17")] +//! # use serde::{Deserialize, Serialize}; +//! # #[cfg(feature = "has_arrow2")] //! # fn main() -> serde_arrow::Result<()> { -//! use serde_arrow::{ -//! schema::TracingOptions, -//! arrow2::{serialize_into_fields, serialize_into_arrays} -//! }; +//! use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; //! -//! ##[derive(Serialize)] +//! ##[derive(Serialize, Deserialize)] //! struct Example { //! a: f32, //! b: i32, @@ -62,13 +53,15 @@ //! ]; //! //! // Auto-detect the arrow types. 
Result may need to be overwritten and -//! // customized, see serde_arrow::schema::Strategy for details. -//! let fields = serialize_into_fields(&records, TracingOptions::default())?; -//! let arrays = serialize_into_arrays(&fields, &records)?; +//! // customized, see serde_arrow::schema for details. +//! let fields = SerdeArrowSchema::from_type::(TracingOptions::default())? +//! .to_arrow2_fields()?; +//! +//! let arrays = serde_arrow::to_arrow2(&fields, &records)?; //! //! # Ok(()) //! # } -//! # #[cfg(not(feature = "arrow2-0-17"))] +//! # #[cfg(not(feature = "has_arrow2"))] //! # fn main() { } //! ``` //! @@ -124,14 +117,13 @@ //! mod internal; -/// Internal. Do not use +/// *Internal. Do not use* /// /// This module is an internal implementation detail and not subject to any /// compatibility promises. It re-exports the arrow impls selected via features /// to allow usage in doc tests or benchmarks. /// #[rustfmt::skip] -#[doc(hidden)] pub mod _impl { #[allow(unused)] macro_rules! build_arrow2_crate { @@ -196,6 +188,7 @@ pub mod _impl { #[cfg(has_arrow_37)] build_arrow_crate!(arrow_array_37, arrow_buffer_37, arrow_data_37, arrow_schema_37); #[cfg(has_arrow_36)] build_arrow_crate!(arrow_array_36, arrow_buffer_36, arrow_data_36, arrow_schema_36); + /// Documentation pub mod docs { #[doc = include_str!("../Implementation.md")] #[cfg(not(doctest))] @@ -237,7 +230,7 @@ pub mod arrow { serialize_into_arrays, serialize_into_field, serialize_into_fields, ArrayBuilder, }; - /// Build arrays record by record + /// Renamed to [`serde_arrow::ArrowBuilder`][crate::ArrowBuilder] #[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. 
Use serde_arrow::Arrow2Builder instead"] pub type ArraysBuilder = crate::arrow_impl::api::ArrowBuilder; } @@ -257,7 +250,7 @@ pub mod arrow2 { serialize_into_arrays, serialize_into_field, serialize_into_fields, ArrayBuilder, }; - /// Build arrays record by record + /// Renamed to [`serde_arrow::Arrow2Builder`][crate::Arrow2Builder] #[deprecated = "serde_arrow::arrow2::ArraysBuilder is deprecated. Use serde_arrow::Arrow2Builder instead"] pub type ArraysBuilder = crate::arrow2_impl::api::Arrow2Builder; } diff --git a/serde_arrow/src/schema.rs b/serde_arrow/src/schema.rs index bc41452f..4007be10 100644 --- a/serde_arrow/src/schema.rs +++ b/serde_arrow/src/schema.rs @@ -19,7 +19,7 @@ //! time objects that are serialized to strings (chrono's default), use //! //! ```rust -//! # #[cfg(feature="arrow2")] +//! # #[cfg(feature="has_arrow2")] //! # fn main() { //! # use arrow2::datatypes::{DataType, Field}; //! # use serde_arrow::schema::{STRATEGY_KEY, Strategy}; @@ -27,7 +27,7 @@ //! field.data_type = DataType::Date64; //! field.metadata = Strategy::UtcStrAsDate64.into(); //! # } -//! # #[cfg(not(feature="arrow2"))] +//! # #[cfg(not(feature="has_arrow2"))] //! # fn main() {} //! 
``` pub use crate::internal::{ From 5d87e9922312f6b1c39bcdce46ed0b62b40fb92b Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 13:39:34 +0100 Subject: [PATCH 19/27] Add doc tests for migrated APIs --- serde_arrow/src/arrow2_impl/api.rs | 113 ++++++++++++++++-- serde_arrow/src/arrow_impl/api.rs | 110 ++++++++++++++++- serde_arrow/src/internal/error.rs | 6 +- serde_arrow/src/internal/generic.rs | 38 +++++- serde_arrow/src/lib.rs | 3 + serde_arrow/src/test_end_to_end/issue_90.rs | 4 +- serde_arrow/src/test_end_to_end/test_items.rs | 2 +- 7 files changed, 255 insertions(+), 21 deletions(-) diff --git a/serde_arrow/src/arrow2_impl/api.rs b/serde_arrow/src/arrow2_impl/api.rs index e9863d5e..23b48bec 100644 --- a/serde_arrow/src/arrow2_impl/api.rs +++ b/serde_arrow/src/arrow2_impl/api.rs @@ -17,7 +17,6 @@ use crate::{ }, }; - /// Build arrow2 arrays record by record (*requires one of the `arrow2-*` /// features*) /// @@ -101,7 +100,6 @@ impl Arrow2Builder { } } - /// Build arrow2 arrays from the given items (*requires one of the `arrow2-*` /// features*) /// @@ -152,7 +150,6 @@ where interpreter.build_arrow2_arrays() } - /// Deserialize items from the given arrow2 arrays (*requires* one of the /// `arrow2-*` features) /// @@ -217,7 +214,29 @@ where deserialize_from_source(interpreter) } -/// Determine the schema (as a list of fields) for the given items +/// Replaced by +/// [`SerdeArrowSchema::from_samples`][crate::schema::SerdeArrowSchema::from_samples] +/// (*[example][serialize_into_fields]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde::Serialize; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: u32, +/// b: f32, +/// } +/// +/// let samples = [Record { a: 1, b: 2.0 }, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&samples, TracingOptions::default())? 
+/// .to_arrow2_fields()?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow2::serialize_into_fields is deprecated. Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> where @@ -249,8 +268,25 @@ where crate::from_arrow2(fields, arrays) } - -/// Determine the schema of an object that represents a single array +/// Replaced by +/// [`SerdeArrowSchema::from_samples`][crate::schema::SerdeArrowSchema::from_samples] +/// and [`Items`][crate::utils::Items] (*[example][serialize_into_field]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde_arrow::{ +/// schema::{SerdeArrowSchema, TracingOptions}, +/// utils::Items, +/// }; +/// +/// let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? +/// .to_arrow2_fields()?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow2::serialize_into_field is deprecated. Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result where @@ -262,7 +298,26 @@ where Field::try_from(&field) } -/// Serialize a sequence of objects representing a single array into an array +/// Replaced by [`serde_arrow::to_arrow2`][crate::to_arrow2] and +/// [`Items`][crate::utils::Items] (*[example][serialize_into_array]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde_arrow::{ +/// schema::{SerdeArrowSchema, TracingOptions}, +/// utils::Items, +/// }; +/// +/// let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? 
+/// .to_arrow2_fields()?; +/// +/// let arrays = serde_arrow::to_arrow2(&fields, &Items(&samples))?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow2::serialize_into_array is deprecated. Use serde_arrow::to_arrow2 instead"] pub fn serialize_into_array(field: &Field, items: &T) -> Result> where @@ -279,7 +334,25 @@ where interpreter.build_arrow2_array() } -/// Deserialize a sequence of objects from a single array +/// Replaced by [`serde_arrow::from_arrow2`][crate::from_arrow2] and +/// [`Items`][crate::utils::Items] (*[example][deserialize_from_array]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// # let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// # let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? +/// # .to_arrow2_fields()?; +/// # let arrays = serde_arrow::to_arrow2(&fields, &Items(&samples))?; +/// # +/// use serde_arrow::utils::Items; +/// +/// let Items(items): Items> = serde_arrow::from_arrow2(&fields, &arrays)?; +/// # +/// # drop(items); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow2::deserialize_from_array is deprecated. 
Use serde_arrow::from_arrow2 instead"] pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result where @@ -289,7 +362,29 @@ where generic::deserialize_from_array(field, array.as_ref()) } -/// Build a single array item by item +/// Replaced by [`Arrow2Builder`][crate::Arrow2Builder] and +/// [`Items`][crate::utils::Items] / [`Item`][crate::utils::Item] (*[example][ArrayBuilder]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// # use serde_arrow::_impl::arrow2; +/// use arrow2::datatypes::{DataType, Field}; +/// use serde_arrow::{Arrow2Builder, utils::{Items, Item}}; +/// +/// let fields = vec![Field::new("item", DataType::UInt8, false)]; +/// let mut builder = Arrow2Builder::new(&fields)?; +/// +/// builder.push(&Item(0))?; +/// builder.push(&Item(1))?; +/// builder.push(&Item(2))?; +/// +/// builder.extend(&Items(&[3, 4, 5]))?; +/// +/// let arrays = builder.build_arrays()?; +/// # drop(arrays); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow2::ArrayBuilder is deprecated. 
Use serde_arrow::Arrow2Builder instead"] pub struct ArrayBuilder(generic::GenericBuilder); diff --git a/serde_arrow/src/arrow_impl/api.rs b/serde_arrow/src/arrow_impl/api.rs index 5db4f067..2f49a064 100644 --- a/serde_arrow/src/arrow_impl/api.rs +++ b/serde_arrow/src/arrow_impl/api.rs @@ -17,7 +17,6 @@ use crate::{ }, }; - /// Build arrow arrays record by record (*requires one of the `arrow-*` /// features*) /// @@ -211,7 +210,29 @@ where deserialize_from_source(interpreter) } -/// Determine the schema (as a list of fields) for the given items +/// Replaced by +/// [`SerdeArrowSchema::from_samples`][crate::schema::SerdeArrowSchema::from_samples] +/// (*[example][serialize_into_fields]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde::Serialize; +/// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// +/// ##[derive(Serialize)] +/// struct Record { +/// a: u32, +/// b: f32, +/// } +/// +/// let samples = [Record { a: 1, b: 2.0 }, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&samples, TracingOptions::default())? +/// .to_arrow_fields()?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serialize_into_fields is deprecated. Use serde_arrow::schema::SerdeArrowSchema::from_samples instead"] pub fn serialize_into_fields(items: &T, options: TracingOptions) -> Result> where @@ -224,7 +245,25 @@ where schema.to_arrow_fields() } -/// Determine the schema of an object that represents a single array +/// Replaced by +/// [`SerdeArrowSchema::from_samples`][crate::schema::SerdeArrowSchema::from_samples] +/// and [`Items`][crate::utils::Items] (*[example][serialize_into_field]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde_arrow::{ +/// schema::{SerdeArrowSchema, TracingOptions}, +/// utils::Items, +/// }; +/// +/// let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? 
+/// .to_arrow_fields()?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serialize_into_field is deprecated. Use serde_arrow::to_arrow with serde_arrow::utils::Items instead"] pub fn serialize_into_field(items: &T, name: &str, options: TracingOptions) -> Result where @@ -256,7 +295,26 @@ where crate::from_arrow(fields, arrays) } -/// Serialize an object that represents a single array into an array +/// Replaced by [`serde_arrow::to_arrow`][crate::to_arrow] and +/// [`Items`][crate::utils::Items] (*[example][serialize_into_array]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// use serde_arrow::{ +/// schema::{SerdeArrowSchema, TracingOptions}, +/// utils::Items, +/// }; +/// +/// let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? +/// .to_arrow_fields()?; +/// +/// let arrays = serde_arrow::to_arrow(&fields, &Items(&samples))?; +/// # +/// # drop(fields); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serialize_into_array is deprecated. Use serde_arrow::arrow::ArrayBuilder instead"] pub fn serialize_into_array(field: &Field, items: &T) -> Result where @@ -273,7 +331,25 @@ where interpreter.build_arrow_array() } -/// Deserialize a sequence of objects from a single array +/// Replaced by [`serde_arrow::from_arrow`][crate::from_arrow] and +/// [`Items`][crate::utils::Items] (*[example][deserialize_from_array]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; +/// # let samples: Vec = vec![1, 2, 3, /* ... */ ]; +/// # let fields = SerdeArrowSchema::from_samples(&Items(&samples), TracingOptions::default())? 
+/// # .to_arrow_fields()?; +/// # let arrays = serde_arrow::to_arrow(&fields, &Items(&samples))?; +/// # +/// use serde_arrow::utils::Items; +/// +/// let Items(items): Items> = serde_arrow::from_arrow(&fields, &arrays)?; +/// # +/// # drop(items); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "deserialize_from_array is deprecated. Use serde_arrow::from_arrow instead"] pub fn deserialize_from_array<'de, T, A>(field: &'de Field, array: &'de A) -> Result where @@ -283,7 +359,29 @@ where generic::deserialize_from_array(field, array.as_ref()) } -/// Build a single array item by item +/// Replaced by [`ArrowBuilder`][crate::ArrowBuilder] and +/// [`Items`][crate::utils::Items] / [`Item`][crate::utils::Item] (*[example][ArrayBuilder]*) +/// +/// ```rust +/// # fn main() -> serde_arrow::Result<()> { +/// # use serde_arrow::_impl::arrow; +/// use arrow::datatypes::{DataType, Field}; +/// use serde_arrow::{ArrowBuilder, utils::{Items, Item}}; +/// +/// let fields = vec![Field::new("item", DataType::UInt8, false)]; +/// let mut builder = ArrowBuilder::new(&fields)?; +/// +/// builder.push(&Item(0))?; +/// builder.push(&Item(1))?; +/// builder.push(&Item(2))?; +/// +/// builder.extend(&Items(&[3, 4, 5]))?; +/// +/// let arrays = builder.build_arrays()?; +/// # drop(arrays); +/// # Ok(()) +/// # } +/// ``` #[deprecated = "serde_arrow::arrow::ArrayBuilder is deprecated. 
Use serde_arrow::ArrowBuilder instead"] pub struct ArrayBuilder(generic::GenericBuilder); diff --git a/serde_arrow/src/internal/error.rs b/serde_arrow/src/internal/error.rs index 69b082c2..957aa602 100644 --- a/serde_arrow/src/internal/error.rs +++ b/serde_arrow/src/internal/error.rs @@ -160,12 +160,14 @@ impl From for Error { } } +pub type PanicOnError = std::result::Result; + /// An error type for testing, that panics once an error is converted #[allow(unused)] #[derive(Debug)] -pub struct PanicOnError; +pub struct PanicOnErrorError; -impl From for PanicOnError { +impl From for PanicOnErrorError { fn from(value: E) -> Self { panic!("{value}"); } diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index 334e29a9..c90e643b 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -68,13 +68,49 @@ where deserialize_from_source(interpreter) } -/// A wrapper around a sequence of individual items +/// A wrapper around a sequence of items +/// +/// When serialized or deserialized, it behaves as if each item was wrapped in a +/// struct with a single attribute `item`. +/// +/// ```rust +/// # fn main() -> serde_arrow::_impl::PanicOnError<()> { +/// # use serde_arrow::utils::Items; +/// # +/// assert_eq!( +/// serde_json::to_string(&Items([13, 21]))?, +/// r#"[{"item":13},{"item":21}]"#, +/// ); +/// +/// let Items(items): Items> = serde_json::from_str(r#"[ +/// {"item": 21}, +/// {"item": 42} +/// ]"#)?; +/// assert_eq!(items, &[21, 42]); +/// # Ok(()) +/// # } +/// ``` pub struct Items( /// The wrapped object pub T, ); /// A wrapper around a single item +/// +/// When serialized or deserialized, it behaves as if the Item was wrapped in a +/// struct with a single attribute `item`. 
+/// +/// ```rust +/// # fn main() -> serde_arrow::_impl::PanicOnError<()> { +/// # use serde_arrow::utils::Item; +/// # +/// assert_eq!(serde_json::to_string(&Item(42))?, r#"{"item":42}"#); +/// +/// let Item(item): Item = serde_json::from_str(r#"{"item":21}"#)?; +/// assert_eq!(item, 21); +/// # Ok(()) +/// # } +/// ``` pub struct Item( /// The wrapped object pub T, diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index f8be1375..8dca4827 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -202,6 +202,9 @@ pub mod _impl { #[cfg(not(doctest))] pub mod status {} } + + // Reexport for tests + pub use crate::internal::error::PanicOnError; } #[cfg(all(test, has_arrow, has_arrow2))] diff --git a/serde_arrow/src/test_end_to_end/issue_90.rs b/serde_arrow/src/test_end_to_end/issue_90.rs index dfea8dd0..97dc7908 100644 --- a/serde_arrow/src/test_end_to_end/issue_90.rs +++ b/serde_arrow/src/test_end_to_end/issue_90.rs @@ -22,7 +22,7 @@ pub struct VectorMetric { } #[test] -fn example() -> Result<(), PanicOnError> { +fn example() -> PanicOnError<()> { let metrics = vec![ VectorMetric { distribution: Some(Distribution { @@ -55,7 +55,7 @@ fn example() -> Result<(), PanicOnError> { } #[test] -fn example_top_level_none() -> Result<(), PanicOnError> { +fn example_top_level_none() -> PanicOnError<()> { use serde_arrow::schema::SerdeArrowSchema; // top-level options are not supported if fields are are extracted diff --git a/serde_arrow/src/test_end_to_end/test_items.rs b/serde_arrow/src/test_end_to_end/test_items.rs index 3a27b6c1..3805ba33 100644 --- a/serde_arrow/src/test_end_to_end/test_items.rs +++ b/serde_arrow/src/test_end_to_end/test_items.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::{self as serde_arrow, internal::{error::PanicOnError, generic::Items}, schema::TracingOptions}; #[test] -fn example() -> Result<(), PanicOnError> { +fn example() -> PanicOnError<()> { use serde_arrow::schema::SerdeArrowSchema; let items: Vec = 
vec![1, 2, 3, 4, 5]; From 415d618e5d29a6eaf540f700cc28fab4c122132d Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 13:52:00 +0100 Subject: [PATCH 20/27] Update top-level docs --- Readme.md | 26 ++++++++++++-------------- serde_arrow/src/lib.rs | 35 ++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/Readme.md b/Readme.md index 53bd1f1c..01da44eb 100644 --- a/Readme.md +++ b/Readme.md @@ -36,27 +36,25 @@ arrays, and deserialization from arrays to Rust structs. ## Example ```rust -#[derive(Serialize)] -struct Item { +use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; + +#[derive(Serialize, Deserialize)] +struct Record { a: f32, b: i32, - point: Point, } -#[derive(Serialize)] -struct Point(f32, f32); - -let items = vec![ - Item { a: 1.0, b: 1, point: Point(0.0, 1.0) }, - Item { a: 2.0, b: 2, point: Point(2.0, 3.0) }, - // ... +let records = vec![ + Record { a: 1.0, b: 1 }, + Record { a: 2.0, b: 2 }, + Record { a: 3.0, b: 3 }, ]; -// detect the field types and convert the items to arrays -use serde_arrow::arrow2::{serialize_into_fields, serialize_into_arrays}; +let fields = + SerdeArrowSchema::from_type::(TracingOptions::default())? + .to_arrow2_fields()?; -let fields = serialize_into_fields(&items, TracingOptions::default())?; -let arrays = serialize_into_arrays(&fields, &items)?; +let arrays = serde_arrow::to_arrow2(&fields, &records)?; ``` These arrays can now be written to disk using the helper method defined in the diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 8dca4827..c67380e8 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -10,25 +10,30 @@ //! //! In the Rust ecosystem there are two competing implementations of the arrow //! in-memory format, [`arrow`][] and [`arrow2`][]. `serde_arrow` supports both. +//! The supported arrow implementations can be selected via +//! [features](#features). //! -//! 
`serde_arrow` relies on a schema to translate between Rust and Arrow. The -//! schema is expressed as Arrow fields and describes the schema of the arrays. +//! `serde_arrow` relies on a schema to translate between Rust and Arrow as +//! their type systems are not directly translatable. The schema is expressed as +//! a collection of Arrow fields with additional metadata describing the arrays. //! E.g., to convert Rust strings containing timestamps to Date64 arrays, the //! schema should contain a `Date64`. `serde_arrow` supports to derive the //! schema from the data itself via schema tracing, but does not require it. It -//! is always possible to specify the schema manually. See the [`schema`] -//! module for further details. +//! is always possible to specify the schema manually. See the [`schema`] module +//! for further details. //! //! ## Overview //! -//! | Operation | `arrow` | `arrow2` | +//! | Operation | `arrow` | `arrow2` | //! |------------------|------------------|-------------------| +//! | Required features | [`arrow-*`](#features) | [`arrow2-*`](#features) | +//! | | | | //! | Rust to Arrow | [`to_arrow`] | [`to_arrow2`] | //! | Arrow to Rust | [`from_arrow`] | [`from_arrow2`] | //! | Arrow Builder | [`ArrowBuilder`] | [`Arrow2Builder`] | //! | | | | -//! | Fields to SerdeArrowSchema | [`SerdeArrowSchema::from_arrow_fields`][schema::SerdeArrowSchema::from_arrow_fields] | [`SerdeArrowSchema::form_arrow2_fields`][schema::SerdeArrowSchema::from_arrow2_fields] | -//! | SerdeArrowSchema to fields | [`schema.to_arrow_fields()`][schema::SerdeArrowSchema::to_arrow_fields] | [`schema.to_arrow2_fields()`][schema::SerdeArrowSchema::to_arrow2_fields] | +//! | Fields to Schema | [`SerdeArrowSchema::from_arrow_fields`][schema::SerdeArrowSchema::from_arrow_fields] | [`SerdeArrowSchema::from_arrow2_fields`][schema::SerdeArrowSchema::from_arrow2_fields] | +//!
| Schema to fields | [`schema.to_arrow_fields()`][schema::SerdeArrowSchema::to_arrow_fields] | [`schema.to_arrow2_fields()`][schema::SerdeArrowSchema::to_arrow2_fields] | //! //! ## Example //! @@ -41,24 +46,24 @@ //! use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; //! //! ##[derive(Serialize, Deserialize)] -//! struct Example { +//! struct Record { //! a: f32, //! b: i32, //! } //! //! let records = vec![ -//! Example { a: 1.0, b: 1 }, -//! Example { a: 2.0, b: 2 }, -//! Example { a: 3.0, b: 3 }, +//! Record { a: 1.0, b: 1 }, +//! Record { a: 2.0, b: 2 }, +//! Record { a: 3.0, b: 3 }, //! ]; //! -//! // Auto-detect the arrow types. Result may need to be overwritten and -//! // customized, see serde_arrow::schema for details. -//! let fields = SerdeArrowSchema::from_type::(TracingOptions::default())? +//! let fields = +//! SerdeArrowSchema::from_type::(TracingOptions::default())? //! .to_arrow2_fields()?; //! //! let arrays = serde_arrow::to_arrow2(&fields, &records)?; -//! +//! # +//! # drop(arrays); //! # Ok(()) //! # } //! 
# #[cfg(not(feature = "has_arrow2"))] From c5c81d7b9aa104986e33f0a4068c8cbba5a54f0c Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 15:06:02 +0100 Subject: [PATCH 21/27] Fix bug in deserialize_from_source for Maps --- serde_arrow/src/internal/source.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/serde_arrow/src/internal/source.rs b/serde_arrow/src/internal/source.rs index eefce8d7..15e1dab4 100644 --- a/serde_arrow/src/internal/source.rs +++ b/serde_arrow/src/internal/source.rs @@ -116,8 +116,10 @@ impl<'de, 'a, 'event, S: EventSource<'event>> de::Deserializer<'de> Some(Event::F64(_)) => self.deserialize_f64(visitor), Some(Event::Str(_)) => self.deserialize_str(visitor), Some(Event::OwnedStr(_)) => self.deserialize_string(visitor), - Some(Event::StartStruct) => self.deserialize_map(visitor), + Some(Event::StartStruct) => self.deserialize_struct("", &[], visitor), + Some(Event::StartMap) => self.deserialize_map(visitor), Some(Event::StartSequence) => self.deserialize_seq(visitor), + Some(Event::StartTuple) => self.deserialize_tuple(0, visitor), Some(Event::Variant(_, _) | Event::OwnedVariant(_, _)) => { self.deserialize_enum("", &[], visitor) } From 7c91f3f21d1948818437bc2439ee074d97f5553b Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 15:06:24 +0100 Subject: [PATCH 22/27] Rework schema docs --- serde_arrow/Quickstart.md | 23 +-- serde_arrow/src/internal/schema.rs | 216 ++++++++++++++++++++++------- serde_arrow/src/lib.rs | 3 +- serde_arrow/src/schema.rs | 22 ++- 4 files changed, 175 insertions(+), 89 deletions(-) diff --git a/serde_arrow/Quickstart.md b/serde_arrow/Quickstart.md index 39fc5ad9..5380373f 100644 --- a/serde_arrow/Quickstart.md +++ b/serde_arrow/Quickstart.md @@ -5,8 +5,7 @@ 1. [Working with date time objects](#working-with-date-time-objects) 2. [Dictionary encoding for strings](#dictionary-encoding-for-strings) 3. [Working with enums](#working-with-enums) -4. 
[Specifying the schema in JSON](#specifying-the-schema-in-json) -5. [Convert from arrow2 to arrow arrays](#convert-from-arrow2-to-arrow-arrays) +4. [Convert from arrow2 to arrow arrays](#convert-from-arrow2-to-arrow-arrays) ## Working with date time objects @@ -120,26 +119,6 @@ will be mapped to the following arrow union: - `type = 1`: `Struct { 0: u32, 1: u32 }` - `type = 2`: `Struct { a: f32, b: f32 }` -## Specifying the schema in JSON - -TODO: cross-reference - -```rust -let schema_json = r#" - [ - { - "name": "date", - "data_type": "Date64", - "strategy": "NaiveStrAsDate64" - }, - {"name":"foo","data_type":"U8"}, - {"name":"bar","data_type":"Utf8"} - ] -"#; - -let schema: Schema = serde_json::from_str(&schema_json).unwrap(); -``` - ## Convert from arrow2 to arrow arrays Both `arrow` and `arrow2` use the Arrow memory format. Thanks to this fact, it diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index 9a79e952..3733da34 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -18,55 +18,14 @@ pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; /// A collection of fields as understood by `serde_arrow` /// -/// `SerdeArrowSchema` is designed to be easily serialized and deserialized +/// There are three main ways to specify the schema: /// -/// ```rust -/// # use serde_arrow::schema::SerdeArrowSchema; -/// let schema_json = r#" -/// [ -/// { -/// "name": "date", -/// "data_type": "Date64", -/// "strategy": "NaiveStrAsDate64" -/// }, -/// {"name":"foo","data_type":"U8"}, -/// {"name":"bar","data_type":"Utf8"} -/// ] -/// "#; -/// -/// let schema: SerdeArrowSchema = serde_json::from_str(&schema_json).unwrap(); -/// ``` -/// -/// The schema can be given in two ways: -/// -/// - an array of fields -/// - or an object with a `"fields"` key that contains an array of fields -/// -/// Each field is an object with the following keys: -/// -/// - `"name"` (**required**): the name of the field -/// - 
`"data_type"` (**required**): the data type of the field as a string -/// - `"nullable"` (**optional**): if `true`, the field can contain null values -/// - `"strategy"` (**optional**): if given a string describing the strategy to -/// use (e.g., "NaiveStrAsDate64"). -/// - `"children"` (**optional**): a list of child fields, the semantics depend -/// on the data type -/// -/// The following data types can be given -/// -/// - booleans: `"Bool"` -/// - signed integers: `"I8"`, `"I16"`, `"I32"`, `"I64"` -/// - unsigned integers: `"U8"`, `"U16"`, `"U32"`, `"U64"` -/// - floats: `"F16"`, `"F32"`, `"F64"` -/// - strings: `"Utf8"`, `"LargeUtf8"` -/// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field -/// named `"element"` that describes the element types -/// - structs: `"Struct"`. `"children"` must contain the child fields -/// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and -/// `"value"` that encode the key and value types -/// - unions: `"Union"`. `"children"` must contain the different variants -/// - dictionaries: `"Dictionary"`. `"children"` must contain two different -/// fields, named `"key"` of integer type and named `"value"` of string type +/// 1. [`SerdeArrowSchema::from_value`]: specify the schema manually, e.g., as a +/// JSON value +/// 2. [`SerdeArrowSchema::from_type`]: determine the schema from the record +/// type +/// 3. 
[`SerdeArrowSchema::from_samples`]: Determine the schema from samples of +/// the data /// #[derive(Default, Debug, PartialEq, Clone, Serialize, Deserialize)] #[serde(from = "SchemaSerializationOptions")] @@ -96,7 +55,112 @@ impl SerdeArrowSchema { Self::default() } - /// Determine the schema from the given type + /// Build the schema from an object that implements serialize (e.g., `serde_json::Value`) + /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// use serde_arrow::schema::SerdeArrowSchema; + /// + /// let schema = serde_json::json!([ + /// {"name":"foo","data_type":"U8"}, + /// {"name":"bar","data_type":"Utf8"}, + /// ]); + /// + /// let schema = SerdeArrowSchema::from_value(&schema)?; + /// # Ok(()) + /// # } + /// ``` + /// + /// `SerdeArrowSchema` can also be directly serialized and deserialized. + /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # let json_schema_str = "[]"; + /// # + /// use serde_arrow::schema::SerdeArrowSchema; + /// + /// let schema: SerdeArrowSchema = serde_json::from_str(json_schema_str)?; + /// serde_json::to_string(&schema)?; + /// # Ok(()) + /// # } + /// ``` + /// + /// The schema can be given in two ways: + /// + /// - an array of fields + /// - or an object with a `"fields"` key that contains an array of fields + /// + /// Each field is an object with the following keys: + /// + /// - `"name"` (**required**): the name of the field + /// - `"data_type"` (**required**): the data type of the field as a string + /// - `"nullable"` (**optional**): if `true`, the field can contain null values + /// - `"strategy"` (**optional**): if given a string describing the strategy to + /// use (e.g., "NaiveStrAsDate64"). 
+ /// - `"children"` (**optional**): a list of child fields, the semantics depend + /// on the data type + /// + /// The following data types can be given + /// + /// - booleans: `"Bool"` + /// - signed integers: `"I8"`, `"I16"`, `"I32"`, `"I64"` + /// - unsigned integers: `"U8"`, `"U16"`, `"U32"`, `"U64"` + /// - floats: `"F16"`, `"F32"`, `"F64"` + /// - strings: `"Utf8"`, `"LargeUtf8"` + /// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field + /// named `"element"` that describes the element types + /// - structs: `"Struct"`. `"children"` must contain the child fields + /// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and + /// `"value"` that encode the key and value types + /// - unions: `"Union"`. `"children"` must contain the different variants + /// - dictionaries: `"Dictionary"`. `"children"` must contain two different + /// fields, named `"key"` of integer type and named `"value"` of string type + /// + pub fn from_value(value: &T) -> Result { + // simple version of serde-transcode + let mut events = Vec::::new(); + crate::internal::sink::serialize_into_sink(&mut events, value)?; + let this: Self = crate::internal::source::deserialize_from_source(&events)?; + Ok(this) + } + + /// Determine the schema from the given record type + /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # use serde_arrow::_impl::arrow; + /// use arrow::datatypes::DataType; + /// use serde::Deserialize; + /// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// + /// ##[derive(Deserialize)] + /// struct Record { + /// int: i32, + /// float: f64, + /// string: String, + /// } + /// + /// let schema = SerdeArrowSchema::from_type::(TracingOptions::default())?; + /// let fields = schema.to_arrow_fields()?; + /// + /// assert_eq!(*fields[0].data_type(), DataType::Int32); + /// assert_eq!(*fields[1].data_type(), DataType::Float64); + /// assert_eq!(*fields[2].data_type(), DataType::LargeUtf8); + 
/// # Ok(()) + /// # } + /// ``` + /// + /// This approach requires the type to implement + /// [`Deserialize`][serde::Deserialize]. As only type information is used, + /// it is not possible to detect data dependent properties. E.g., it is not + /// possible to auto detect date time strings. + /// + /// Note, the type must encode a single "row" in the resulting data frame. + /// When encoding single arrays, use the [Item][crate::utils::Item] wrapper + /// instead of [Items][crate::utils::Items]. + /// + /// See [TracingOptions] for customization options. + /// pub fn from_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_type::()?; @@ -105,11 +169,57 @@ impl SerdeArrowSchema { /// Determine the schema from the given samples /// - /// To correctly record the type information make sure to: + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # use serde_arrow::_impl::arrow; + /// use arrow::datatypes::DataType; + /// use serde::Serialize; + /// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// + /// ##[derive(Serialize)] + /// struct Record { + /// int: i32, + /// float: f64, + /// string: String, + /// } + /// + /// let samples = vec![ + /// Record { + /// int: 1, + /// float: 2.0, + /// string: String::from("hello") + /// }, + /// Record { + /// int: -1, + /// float: 32.0, + /// string: String::from("world") + /// }, + /// // ... 
+ /// ]; + /// + /// let schema = SerdeArrowSchema::from_samples(&samples, TracingOptions::default())?; + /// let fields = schema.to_arrow_fields()?; + /// + /// assert_eq!(*fields[0].data_type(), DataType::Int32); + /// assert_eq!(*fields[1].data_type(), DataType::Float64); + /// assert_eq!(*fields[2].data_type(), DataType::LargeUtf8); + /// # Ok(()) + /// # } + /// ``` + /// + /// This approach requires the type to implement + /// [`Serialize`][serde::Serialize] and the samples to include all relevant + /// values. It uses only the information encoded in the samples to generate + /// the schema. Therefore, the following requirements must be met: + /// + /// - at least one `Some` value for `Option` fields + /// - all variants of enum fields + /// - at least one element of sequence fields (e.g., `Vec`) + /// - at least one example of map types (with all possible keys , if + /// [`options.map_as_struct == true`][TracingOptions::map_as_struct]) + /// (e.g., `HashMap`) /// - /// - include values for `Option` - /// - include all variants of an enum - /// - include at least single element of a list or a map + /// See [TracingOptions] for customization options. /// pub fn from_samples(samples: &T, options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index c67380e8..195f276c 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -19,7 +19,8 @@ //! E.g., to convert Rust strings containing timestamps to Date64 arrays, the //! schema should contain a `Date64`. `serde_arrow` supports to derive the //! schema from the data itself via schema tracing, but does not require it. It -//! is always possible to specify the schema manually. See the [`schema`] module +//! is always possible to specify the schema manually. See the [`schema` +//! module][schema] and [SerdeArrowSchema][schema::SerdeArrowSchema] //! for further details. //! //! 
## Overview diff --git a/serde_arrow/src/schema.rs b/serde_arrow/src/schema.rs index 4007be10..3d324feb 100644 --- a/serde_arrow/src/schema.rs +++ b/serde_arrow/src/schema.rs @@ -1,21 +1,17 @@ -//! Configure how Arrow and Rust types are translated into one another +//! The mapping between Rust and Arrow types //! -//! When tracing the schema using the `serialize_into_fields` methods, the -//! following defaults are used: +//! To convert between Rust objects and Arrow types, `serde_arrow` requires +//! schema information as a list of Arrow fields with additional metadata. See +//! [SerdeArrowSchema] for details on how to specify the schema. +//! +//! The default mapping of Rust types to Arrow types is as follows: //! //! - Strings: `LargeUtf8`, i.e., i64 offsets //! - Lists: `LargeList`, i.e., i64 offsets -//! - Strings with dictionary encoding: U32 keys and LargeUtf8 values -//! - Rationale: `polars` cannot handle 64 bit keys in its default -//! configuration -//! -//! Null-only fields (e.g., fields of type `()` or fields with only `None` -//! entries) result in errors per default. -//! [`TracingOptions::allow_null_fields`][crate::internal::tracing::TracingOptions::allow_null_fields] -//! allows to disable this behavior. +//! - Strings with dictionary encoding: `UInt32` keys and `LargeUtf8` values //! -//! All customization of the types happens via the metadata of the fields -//! structs describing arrays. For example, to let `serde_arrow` handle date +//! All customization of the types happens by including a suitable [Strategy] in +//! the metadata of the fields. For example, to let `serde_arrow` handle date //! time objects that are serialized to strings (chrono's default), use //! //!
```rust From 6686f4611a2e923488f3e9f444b61560515219f6 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 15:19:15 +0100 Subject: [PATCH 23/27] Prevent blowup in recursive types --- serde_arrow/src/internal/tracing/tracer.rs | 26 +++++++++++++++++++ .../src/test_impls/issue_90_type_tracing.rs | 14 ++++++++++ 2 files changed, 40 insertions(+) diff --git a/serde_arrow/src/internal/tracing/tracer.rs b/serde_arrow/src/internal/tracing/tracer.rs index 2a9cb744..22a53489 100644 --- a/serde_arrow/src/internal/tracing/tracer.rs +++ b/serde_arrow/src/internal/tracing/tracer.rs @@ -6,6 +6,11 @@ use crate::internal::{ tracing::TracingOptions, }; +// TODO: allow to customize +const MAX_TYPE_DEPTH: usize = 20; +const RECURSIVE_TYPE_WARNING: &str = + "too deeply nested type detected. Recursive types are not supported in schema tracing"; + macro_rules! defined_tracer { ($($variant:ident($impl:ident)),* $(,)? ) => { #[derive(Debug, PartialEq, Clone)] @@ -96,6 +101,10 @@ impl Tracer { pub fn reset(&mut self) -> Result<()> { dispatch_tracer!(self, tracer => tracer.reset()) } + + pub fn get_depth(&self) -> usize { + self.get_path().chars().filter(|c| *c == '.').count() + } } // TODO: move into trace any? 
@@ -104,7 +113,16 @@ impl Tracer { dispatch_tracer!(self, tracer => { tracer.nullable = true; }); } + pub fn enforce_depth_limit(&self) -> Result<()> { + if self.get_depth() >= MAX_TYPE_DEPTH { + fail!("{RECURSIVE_TYPE_WARNING}"); + } + Ok(()) + } + pub fn ensure_struct(&mut self, fields: &[S]) -> Result<()> { + self.enforce_depth_limit()?; + match self { this @ Self::Unknown(_) => { let field_names = fields @@ -152,6 +170,8 @@ impl Tracer { } pub fn ensure_tuple(&mut self, num_fields: usize) -> Result<()> { + self.enforce_depth_limit()?; + match self { this @ Self::Unknown(_) => { let tracer = TupleTracer { @@ -183,6 +203,8 @@ impl Tracer { } pub fn ensure_union(&mut self, variants: &[&str]) -> Result<()> { + self.enforce_depth_limit()?; + match self { this @ Self::Unknown(_) => { let tracer = UnionTracer { @@ -218,6 +240,8 @@ impl Tracer { } pub fn ensure_list(&mut self) -> Result<()> { + self.enforce_depth_limit()?; + match self { this @ Self::Unknown(_) => { let tracer = ListTracer { @@ -242,6 +266,8 @@ impl Tracer { } pub fn ensure_map(&mut self) -> Result<()> { + self.enforce_depth_limit()?; + match self { this @ Self::Unknown(_) => { let tracer = MapTracer { diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs b/serde_arrow/src/test_impls/issue_90_type_tracing.rs index 48bd22b2..108cf637 100644 --- a/serde_arrow/src/test_impls/issue_90_type_tracing.rs +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use super::macros::expect_error; use crate::internal::{ generic::{Item, Items}, schema::{GenericDataType as T, GenericField as F, Strategy}, @@ -307,3 +308,16 @@ mod mixed_tracing_unions { assert_eq!(actual, expected); } } + +#[test] +fn unsupported_recursive_types() { + #[derive(Deserialize)] + struct Tree { + left: Option>, + right: Option>, + } + + let mut tracer = Tracer::new(String::from("$"), TracingOptions::default()); + let res = 
tracer.trace_type::(); + expect_error(&res, "too deeply nested type detected"); +} From 73cd7e11fd2f9f7ecac1557505d9ae5811a8f85a Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 15:57:03 +0100 Subject: [PATCH 24/27] Update benchmarks --- Readme.md | 24 ++++++++++++------------ timings.png | Bin 27247 -> 26886 bytes x.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Readme.md b/Readme.md index 01da44eb..e006ade8 100644 --- a/Readme.md +++ b/Readme.md @@ -111,33 +111,33 @@ The detailed runtimes of the [benchmarks](./serde_arrow/benches/groups/) are lis | label | time [ms] | arrow2_convert | serde_arrow | arrow | |----------------|-----------|----------------|-------------|-------| -| arrow2_convert | 53.14 | 1.00 | 0.15 | 0.03 | -| serde_arrow | 348.85 | 6.57 | 1.00 | 0.20 | -| arrow | 1703.18 | 32.05 | 4.88 | 1.00 | +| arrow2_convert | 52.28 | 1.00 | 0.21 | 0.06 | +| serde_arrow | 252.35 | 4.83 | 1.00 | 0.29 | +| arrow | 877.10 | 16.78 | 3.48 | 1.00 | ### complex_common_serialize(1000000) | label | time [ms] | arrow2_convert | serde_arrow | arrow | |----------------|-----------|----------------|-------------|-------| -| arrow2_convert | 478.85 | 1.00 | 0.18 | 0.05 | -| serde_arrow | 2711.26 | 5.66 | 1.00 | 0.31 | -| arrow | 8729.60 | 18.23 | 3.22 | 1.00 | +| arrow2_convert | 527.58 | 1.00 | 0.22 | 0.06 | +| serde_arrow | 2403.99 | 4.56 | 1.00 | 0.27 | +| arrow | 8857.78 | 16.79 | 3.68 | 1.00 | ### primitives_serialize(100000) | label | time [ms] | arrow2_convert | serde_arrow | arrow | |----------------|-----------|----------------|-------------|-------| -| arrow2_convert | 13.64 | 1.00 | 0.19 | 0.07 | -| serde_arrow | 73.32 | 5.38 | 1.00 | 0.38 | -| arrow | 192.76 | 14.13 | 2.63 | 1.00 | +| arrow2_convert | 13.05 | 1.00 | 0.21 | 0.06 | +| serde_arrow | 62.63 | 4.80 | 1.00 | 0.30 | +| arrow | 209.27 | 16.04 | 3.34 | 1.00 | ### primitives_serialize(1000000) | label | time [ms] | arrow2_convert | serde_arrow | arrow | 
|----------------|-----------|----------------|-------------|-------| -| arrow2_convert | 178.98 | 1.00 | 0.25 | 0.09 | -| serde_arrow | 716.76 | 4.00 | 1.00 | 0.36 | -| arrow | 2005.61 | 11.21 | 2.80 | 1.00 | +| arrow2_convert | 144.58 | 1.00 | 0.24 | 0.07 | +| serde_arrow | 603.95 | 4.18 | 1.00 | 0.28 | +| arrow | 2160.02 | 14.94 | 3.58 | 1.00 | diff --git a/timings.png b/timings.png index f409259fcf55a8f1ae626b5b79a6025a7ceedbf9..a93e623a20a74590a87e7b750912e92a20c5983f 100644 GIT binary patch literal 26886 zcmeIb2UL|=w=IaJrDdedf&{}60}6tm1OX#K5R{xz$vI~MLz#(3q9m0hSwV6#f`CW{ z;T(dJBumcFb6eGY?;ZE`9X(!;JNmub*HJZo{^Er1+uz!2%{Av-+Xt6qCDyNDUcw*LFGR z+t?agS{&m#!Nq%cmyw;Fm8}prxB36|0xnA%L+*oGT^{(5)mD;fwhRp0H<5pr8AjIN ztBh==&Yf0t3L9*3wA-T8Q9NQ;yZN^jXMX(Y!gE35_KHn=t}yhjc_yWBdE@Czom1D| zPCoo?kLu0s%i{N(KY!`eu0Q^eSM=ZJ5^&Av%&y&Ci;L|8Woe-+Z$G;(*3-$7?b!V# zJV!WmApMx5S@Dj&xCH!J>(uG6>eo+i25&!I_Uk9ryZU z$__cY?F3`-_wKi<^piUtg_b1qZaaAJVAeS?+gs#K2efiuZ`!;$!@j?PK2*kPIWyWt zjsEfDhvv{~yn2{cyq5 zA)UhOYu2q(jJXg{m#D(cDriHYxe3_)@D*L0)}8oKeO4(^IpE&CduGQ&zkmPEtGQoD zNWHqcx~X+#Qbz!d_1B}O=eIg_tan?OXcW{Qu8UVNX?`7l+`KvZ+Q*0S)*Z#v=$1T3 z)0$|hzWT(V3;qZEk~5Kr5_=+Cxk}tl4ec`e^&xsMv~Ogz#RF zmHlGJs_V(-K7G}^`L$GP;cSAoOYc5s=Y_Em+u_!NvLJ4q%(+V`nu6lytJg`rIGI+yLx-$ z>l0Oa&J0^H{uBByWiDKJsGO>OgqxeY#dWw~c44Y*qAN(J@56)5hb}+=OFhdX8B5Tk zzC@Pmh*qwmj{8hZtW3!Bp_V-1J~oeeYgQ2##k2P|D4jcZE@fgwxn%45Y~3T8*_R4j zCgkfAlmfVQL`mAh7dLt=&FKWWe&LXdI2>9Ld-eTYbCQGd`lT zFTQtlgl9VHyQ(B!j=GD-p6f8!r)c4MSyJ-;ty{PJ0|O13tbfBI*<8Z|xNzZuyp`2U zaUZtiC;X;;6(`#2rbjv^`VvzU3@V=F2)&B(@$re3i{$d~@R;atl$)Czd~7#!$dge z1lJ{QQWO0kzo3A1Vmr6CqMlwfmLZ9m-{b|N_0XlKHz>wUY5EBYaoz@>gP)Nwbeo$z zq?!FPRyI6A$iB}AYw7J!=6^7%s_Ig*MwYCUl$7_VLY!>xRGUW-C0R44Cf-sb)9mt@ zGq?8a+0)tEYux*a+;dw0cyBE=`tg3bzNN+CrSZ7C!mYau<@9X%^UtTowXuR1H6Mg^{P^b0Z$TH}H*Z#4 zTwL@{v+em3kAG(;X_owMvvwl0KcZy&KR@A@t~@#B+3K<67RNY79x5eSr$~FEuf7(` zwsp&v&KGhhUO`-1b|c@ioFz&Pi&Iqveu>; zyI>T2QP{Kvgpm+O{z+pl4w7*7r~3AxW_lPxl8 zFiW?sjCUAp7P>zD64|2vK-Q|2lMLU=Jc>rkIGGyGi;Ek4%}Ts;=T6Q-Q@WvCCXH;{ 
zlr1}U{Frvc6Vd8(9sjXgO30@3zQb_qf~9?V@`KI$%di9f!uCfv?NOPt&1-+S zl3tB3iaGyin=zK)0#?$+CD)V3J2w%}Ms?Y;WlWhWNvb^c@d_m|ba#z&SZ8`lV}_{; zNlA&y$u@^|8P}z3*o@?bf7DAG8&#~Wtz8CQmFRg^`-m;RxIQx)k6$#=oXMK5{A2xk z`}sViH?9eMI0;MHFo9>A_sa^TR8_2RyFTXCq1-h+?G!HN(R%Q}+l<{{@n6;6L& zTQJq?dPprTy8ClT)LseiA2l)3yu|_$0^hi9ysV7$^-JnIb}hY$9w;F$en?PIjl|&R z$}2@Ndk!7CaF3C_0ckL`H*+;vByr>VgaArUqU~TqvPPVA5Lanin@#|y%3_eKgPMs+ z0+Ld~hY#n-H$FLT{v<&${<+j+4#Ac$o~CE3ZV1=lM)pZXULSpcTS@)?@sV%@+bhRm z?eZ*@d?#zEnw!g4(Hm&{A0I|qR+tIMlG7Av>IkqEIJ%L7Z@1uBzH*%}5|`-w&$HMW zUoJ=Z`nEgGyCSf<`uqD|2qUb>R~}MIp#NMfSe)%wFLEzX&oqk{b({T!E3J*a_!wz6 z!s+|!lC7oL$5*V+-rL0ejVz9I%Rp25kv8qyE7wz`jv0Qj(-1Rn=8NkZKm@&e_ioFk zO?#x)FfsAka@=6k`$ut+ z?#-7ci+5V;nR+y&XeBN!P8A)JkG{_#8}{Qss#abmie)TPW4we98&m5wucVE9?cRDG zdKJj?;jO3e;V~QcimMhDY+x07A?deAy}%`-*oS>s@r1*GsyRLT^1ciHmyr9y`%tV? zsr#f&a4RG#!%|dVR@NRxDDI?Z+qYqQzNh1a*2x$>P*lfWF7VL{Pa4 zN?^@sSNT>}*5o#iB`s9>%ZP8>?$L$m4j;UCx1eB?v8!QaL;%ISt+3^CVEFzxY$0u= zJ|%VaC!%u$$4Y{n9M0Zdclq48zXG|m5|FFKXbZQGaoqmE+S_CpS>IXucw(f4os)}8 z78Rn_Z?D8_mkE>g%zTmv=ht?w&ZC{xC;+TM243u9+5v}@YP?y6$NN)@4jnyui9Ac; zlCvmAq~4WIG~9pj;y}q?zVzd`s~bC?Vg0-%mh(fH)U6qY`%0ZiN-n<#I zauXXHoA>_WZ_U={mk&vjk1qDwS)Zmajm7y9YFSu4GuDH2l)cmD{B4TJ&o|4PUtYb9 zQWpLG9;4}(FhL=Qfnc&Jymp?*r60d)=%Jh0WXAPaM0+Oo;x8c7ZgAxo-RZaXLszd} zyZ5x!@7ZnL4%}C={(6Th;EwFL)9ehTKAqN%R^noRdWcV-|DIzjga|eTtSa61X&=a$~l2$Kk$% z3=D57?mS(VWAgRo)wCwFCnudj^!(1i5_m)-_Mk%S4`ltK?&A&vU!MUA9+ZnXNy^x9 z3;K(YV}>TB)O`dVv?sU|WGMQS%#cMPF^4&rohZm&nrmSR?(*o14o6{QaX# zAMfv9oEvsb82kD2Qn-+PN>(Dm?!-QF(J4QhO;e50I?ayP%5@D6x?MD95weqfw4GZy z$EK?;-+8RRPY(5}uQ8PyjUDjg*BE!)%;@;IBKl7I(azHT=50vNTk~!5uoU#e?n?`f zUd;R#L|i8ofht~mEG?Rv>_-jj?d{E7TAT}b@+1Ke!nooI{{?C3+7`#QWR$IhQ#XHW zGAXaERX}l9!u!YP$0|9|-76(1`cc9~T$NjL?H}`qqfTA6viiwCu^nr8L9^QG-q$q! 
zN2Ju)bd?cQxqJ8S1ayjMDW3wK3LVLFn{yz<0XhIGK+MA}2cLias%j3b8PkG=!J{_D zLuk&ijk~6+s}wD{(b>uAM`CKB&EYAukAwwS^T!{T&{PuW7B1{)(A)!*^J6fpO&NEv zb;phuZ{D0jr(ufzt_q=W)mg@a0s^XN?~4{DL!-ju72*!<*>eWvPT|z4 z8wpB@u~}{I8h}o+v~r`Nhi&e27x7yBBR2o@wn7Vj$%)4V;-Nd&o6#!nf}C8Vu8&CU z)2F)%=7#bb>l-r7X%s2cc$2E9H&LZuywWY{i}DezQIHdfS+n(!(Z`2dn=Dc_b8KeY zZ!?kx&VGolh8}}T5m^|o6|!o-F*`M!%5m|DrV7DFNoT68R(*YK{FrpZy|r=VeoR_& zGv2*_A3&u(=g}=bAS&AWYCuKDEe3s#308@0AN35Am^A%z6Es=50L>EsV7V>g|Gug2 zpmxZKEhqx@HlR;TB18;;bu+$is9;KauqnO7+;zWf7-1yFBbVk0gQ@_?!+hNA1*@>b z3xFI1ZW}+l4|={|m7w>64^XP|? zfa(K{{X-1sVwL3Osp8%&@uzRE6s#Zo`syq&Taew1+Auzsur7rQlN!KqUtiIIJX3+3 z<4qe=nhRXBu3x_%)*Qlb8i*)jD!`V-X1DWa7qB{2DKZrXo+IPR+uuZ{+iz2&{r$I0 z3^XaA!LX(EM2NWd;iFWR78fSI2oHZk=Sj7jpSBhmuR1&X^F3qz^Yd%56FR4z6jLNs*Q;UO%G7Bb7zczAeR`r_m4epKxSl~xrK?(6$!!PpmJ!brGH zHW?BMD^50?1^?IC)z$QKFc0#Nm2p2}vlhIDVwQy#X)|R#PHjIZf3Bm%Cy3%UTDreU zWQA;5;F11KyprLyK1B*a=w!}%S3^)aOv?$=Sq7nQ8DYpb%CzIUg22Zru zH)^d}y}GpZrY}gF?#ca8oELA?9JhXiI@`+D-eX{GO5GmBx z$kQwt8h9LT*gYgaAuK0m`%@E<6B^sDqB5=B$VXph(wK6jca*&b+yw|9`#d_fDn$y- znyp6L)6>hPYLG3p`j}5x3>@cE%`gd|96Wx!A%+XN^J3`nikIwQL#*izR{Q3{)(_44 zX&DHy9Yh*R|W`=tmSSuTWX zrvOftw6k$DEGdO<*FZT4xy``lwDh%*TELEot_OR0Y@nom^8tg#Z}0DsGGmOpVB)eJq6=nqF%23T zOzfA7sKd=DzPPj-tZvDmi+-7Zb$3N*_)w$w!-q9HE%TpcXA7W~zTYUoXHxgcy8aBg zC_I4|<;GR*7PsVjM*}uI5HwkxJrOQw%kiFb9hFLDv2Su7j#o>MjrQm^$!*dR%Gt<*XC2Mzny)rU%W6at zA^D$he`s+DK${)!-K*&$g-Q<9=QTUM=5~GvuDp_?Kh1b zV`w}SW$TV4cSs{MiLeOJ6AbeT=7x)O-oAZHI$uy4~&|AV5UB2rb8wF}f;r-tl7h7H?- ztT*@)7LPR9hnQBtUQo)mx>`QRxJ#Gaefk-pq)06#k0XkX8^a~k;dV*>sR%tG2i)?Q zvVh$0EwOy6w|2hbPJ-1WsFC&SA6{a5O^$P6O~ujK58!R zl9Q6k*-aatoOF;86*O%)95j1GBhv^GPsAlGUs&Hj))`NxW8!ggk;+lxUSWM;u%fXu zj)^Vzl?I64!p@$Go6fB(`1x+F2|fE2f#w$i4#gw8JiLGZl8lU}yuAE)cc^7wtLtz@ zg9E9w*dBxnAuM-PzD|)lQJxx-)H=4<;f^{vZrv4rgG7-4zBwq&*R;>!_%fw4q20#p0hI<%=eRYy#! 
z=H=D*VJS}pK;Du%&!k%yC&wP7!9DQp=7Q8tmAk7|8m(z7tZYXnCkdfGj*9-`(z7$_ z`A)AEr#l!yd7Hew_4{n4$KvB;2T~3S=10qk+5rjayIo=cU@i!(tt>1_joPk*xg4dx zgQi)NB}-o>x-hOFGEt|H+n#i?CfleqST!4oIXE4KaLtMNd-uJie@JP@`FiR zVFCHS0KbRO3X$H&eWq(WLc0&O)}G$#8$h6B-HXdE-64hq&Ltpb4ynI>LC9K^=R{;^ z0+q^LfKwFk0E14Gs=wVOh_|PA^w7vR$umdx36- z`uOp&ByQ!TV49Q*GKI}r6(mO@+d*`Z2KX-zJW_k>_q7DBi%fpioke$|Smf?jl16~@ zaM4lLqN1pZNptN?8-CaNKZ9kp!- z9PE&&Y=SxZAp{WPh$9$#ARyZfw#l@ETy(Edfd>V`%$ zmWLnp(Co-gCMK!BrQ8!cPA|7GRuggAKwkHA_?kERY5cjuL>lY={^37=5mMzI$f@9Z1Q8v*DUrfd}x3?f+yO|Nloa(wl!-WXQfbwesHtJ_*hLB9^NE zO_|0w=K82a%%az`(Mralu_-f1FEvQlQVAt+1juqLGxH%G9i2OO@0z^1u}sLe+mk<> zY$uDo+Bq%D7+(GYkWv*wkDI@??^pH$v}tpj3MUd83WIv4 z2;SZP%fVr>olCQd&@sTwiD+U%@2r(NP)Y9NQJWY;C4-#(qd|A zYF-;vKR-;vTI2(H>;fhSqROoU<)ekGBn_dBO`853zLQ~|M!q!HaX5vL2Pk{8T_7cD zCD`34Qt*mI09X3&Y+0Vt1bDVj3Vo!Fh9x)LPHpLCoRH}d2an_8;;`6Vc3VF{2(ANi zo|&03F7{+)qc4GCk){HS5Zc#_jep90amJ8*$7RV&--ZA>_X%_g(sB!o!7fm0gou1Eyq z3ARFpIE;QdXh!UQ$EUH0xO8?_?ODExku*eLMDGJ+#d}08 z?Oa4eglzwZk`e-|)@;~7+6=qdabvJL!hPJ@`7b~YvVv2CJPu?VxMs_NT~g5OLx^q< z`kfe-9-{3AH_m6@r&tjwrt@gq5wd?MQUD^Cjf{T5+vsQtqD6`-8!pr}eb)ggK-P^0 zMjUCO4s=_PdtN$>22w$mso4=BGbNr(QHX^@V0B0b`u^_v0AJsyM8M|G|9Oj8zDm+O z=11-UMv-5j8%qRPDCG3>X?JC0>hEi}86hlPI{hRxGp#}K5Iz*aL>yhjbqf2=dM4g; zNCdTzh!08;FUb==Juvy(bS1?ijY2VYa|#(x=qds6Qf-x@2$H# z&FI$B_9Z{e|piL-&vr`82rbvPQqWTY( zuELYsEA-`J z-)Pu6WfAHP=%q=lNaTndeYxrn706I3+NCdt`WW+v4xg*g3y-Idr4k861?FwePLuJ|NL=S?k^aq~6>_B>@W@ZO$j;H~GXI4rP zuOD%BK`Amq)8jn~)rIt~PlO#)%8wi!>#1z~_-OkCcoE@u0d;kXM4Kslyx#|VzH6Ls z1?-P*Ki{q*EiCb_OgeZX+Zgr-_b9?0MT>;8rdpV+;U}VMh z(sG?fo)c;UQBGL$#zr+PVmrs_s-N71?8F9+Lb;AdE$IUT+Hrd-MJrD;&=OI9H^@Xf zXunz#E?#;DIfgXbNO%30Z%*jS;=B~)U)oP4)7^w#Zmc@MO*ELy?>^W>4e4ymLJ$ZN;T z?~@8*w>JvfFUOW)(V`B+jJ@-%8+kO2Up+qDCI~=BXP29lnYp<{6azxM(i+c!l-6|I zc0v>6);=k?wH#1Y8P~6`xSR)>k@Th=ODcC&e}#`X!e^JRXN1S1GeW1?nD;GbPF@ym z@f*iprurrHyHu{qf_CqM`A>d(z0-h*;$h8)GmT3#`zgeqvvML6{@^9Hu(1sV$GsI5 zI}4KLSV8l5yc`>{8lo4!Rr@J0SYovcrJtcG+M^BMdRVoJhw;2YO4p9B`h4Fa7ich{ 
zYQlr|h>)Imk1&VZNe~HyRssMdjD@%UU{=9ltz5qH@{U1UY0ZO)jUO`LXHTS!A}j1` z^*z$6KCr9_Me2KItF|`K6idb&SQ%m;C6xkxH6ly!!@@+%q7dMJ;u!MT4}hsE_!{6R>!Vb&Wgsr8Ca3u?gnAiG!aMsG*t=4PaJ?!T`IbJVay| zggWaY798%Q?h}huNaju!mH~N(WL@w6&6_vxWn^T;U0oo89?Vp%_NAqzgd;$L2)Ae} zEdw(?POQ9F-rgc~PNrpB3UtUH)e`LWC^17e(pzxPR*Byl9_%y%`$dhG6P-B+(pLqE zpEYQV7Fm2{-<(J*lZZ*>VzSGFP5Z*iQuE!u_m#4xhP~gN_+S^<#Z+uTW7IT)S2$#X zty&(_wjNTHgdG+a?+<-&a@4bJY-r!6*ylXRs(mbuezmG3+@;jKe60L^J(I z-g0Tbxr*<~8wOJCNC?Kvfk0#ETuB}&Q4HxiR58~{v+^uM53?@VB3l~dAbc1L=3$>A zUhe$$mn0SB6Cs_E@qYj{D-s^YP=y^%#^Owp}7rgW(NHF3~agSX*0LU4-v9 zBwLnP)MH{|a_S82)};iviGnUQFv2wwt5?V6SQUcbM|&z$5gCxm6avjOa!>k2Q{ouk zIOYPa9y)oFK4J@$AZ2qX&qdqoufJA&EGZGLcz<;YTRIU%$2-pTLve9mgzHfC*H^kg zkLQW9)igu{A^IBmgeZ%6e2)H*!fCn4Ee( zD9>tqc6Jp31dvP|W-f{Ti|Nm5_QqZ*a6~GELw6WdUqF78RXO~%$2b@$opezk%z_x+ zx+lItv8o3#5B|DxWZBm>+=bK6QAS%``j5b||1O@3pI-@3%w}E;Mp2YupUL62R26OK zkG(1nT)}iG=F#`3{-0(+1G)l{vTq6cFXB9t^HluALOABrqILpa7XH%uc|s zekHS6?X{7g=uGEkLd_93bHdofL_I==ghHX)oKQ+WNe&k0KUZ5V7nvJKhy5gtP@*3{ zevAhwA_Obu29lv0tBHD?MsLUm^EcrO0q3jXvqVtuBYH9}FcE4kfpA$;engKjFyQ2 zPAoYme3RB1chJ$`ih&9CE=(0I)uYHN0Xh)n*0dqX51S8$&=;`McV)9Pp$0tC_E@kd zE=aUzSX+lYIv6|>Ll?t^lu)shz(p!$nyHaBLnlZYLxfDX6R)@fkwIW1Uc2q^C9tV@ z?GflvBs=vrrN=4dIZ5}TOuCH!@Qbo1>LC)fzN!wWU@unThJCv}JwRV)C~z=|(gJ}M z;%#pCBxw>te+NZ->i?B-(!i$WYCI|8VyIQi+V4T!(TbisnZj5pYlZrw|s1>lPM@D1Ot17ofKr z>5Psk`L~|oc{7RbM#P?t^jOR(T%0xKH?BDckX;MpmjUM~8japN+qJ6zm;*Zomxeu- z6j09XMmwblnGOhLx8f`Ui{YnbddTO(Ys;-9)~sEtaIM_YHg6}>-*nVBo%DZ2(fu=s z_OE8zECOErv3Bia|Bka?{uTSD6I_zhlYZA%;wOk2RIxEknL|R71p8^$p|Np?kYQdIOriwO00x}JTSu`yi zKk_0}-+Tn54+snlq_y0LM%wK~P5Zwc+QeH=xb2`S{G#(tM557_re<2SJSi7B4Tw)E z{%-K*?+MR;Gqd?glPRkL*C}O?6kR<% zaVMQeU*O#?7}T=|Tw}>liIrh$GRs`&0g}%8_GIVe#;scedU@sn7i{a;dWth1JbF}a z$1)&>Qs|P|+(G};{BI`;?N~ySWFk(g0Y#*uFcHBHXjoGW+7W* zui19k>Gsr*20rxUBG6%}(f{;hv+y5G)XItV)~o2$0}c{4W-FJbfW#p9+7@$XtZ-^x zrQ<_1{NNp{V)DV+HCf!Y?P%V5M4j(_qc!k3R+B>t#=z6w&0X|QUy0)bOD!bPzi!9T zQ@X{TesBm7#ss7j@nB%+p*l>^whnJI%OXUo1e^0YlqD7Lh3c=3o&oVygdCGV6I=(I 
zh8@N`yRA|!E($U|LKKjWhsqUs0$w@T%j$q>$mkgGOfdr=g? zj}xDZ#0<1lPs(N{rjWtRT)W#I9R8w4xQ+*+6_2lm8I56kJ;Vs33g-+{~KrOL$1$H=(2xMVhb zLv(^hs3_+yGAo?v-f4>omyZ7a&|V&mT)TKbJ@|qvU`Ax^LUcX`B;h%I`gAuSPgCpS zU#mBJQWOA(Z3n*cNX%e~@qeG4`8{szheP1wS6^Y#GseK78py)h8DD>YJ)o1Tln~4} zg`%U=cb$VF zp|&)@(u@+a`|MpkSJp3HVdZ^3(`d4oqzurE za@k{SE6Gm}?un4WK@h3+lDvo@$9U|-06#r2{*MPIE%rCg-t5D}^=_L2AJ(Mp!>GtK zmvwvPmq(sj{mwk@^G=tQ_!l&(XuRqut^a8)-8kIc#&sFs-%uX_3pN@ z^i>Gke&X&zgRuL(PPT3%1`CDI6PAG#B%N}?L&H0n6`f4x*fh-FHi7clCxtp8Sm@*N z;r_^n6IJ|REIDybangUwNn@E9vL45 z^o3E+vCGNr%X(Ar(LmU0a7q7~{&JIb_mNx*&SvoJs z1j5(BK1bY6gqy~cnQBV~jZ8mHfSDx&m>A|(M2l~8Qfzt8orw)>C&}PzDUGH94{m!~{T!O)cIAkxSY8M>t zZyOwf4cl&Z2`pl2kdC_=LKETDgQ!fI3O#^3(wmt50_qIIz=T~VjTP9sf|N-Cos#Au$j~Fwx2g|Bff%1EU6iMVMP0?B~zOATU_HABb^cIn}T%*a!Pb z0cK0R84hrU?Q9Ts8Giw)h(k_J4(-xGiGCXkRnEl5tPjIfx&y4Fm^VK~9S5P%bKqqP zWG-T{8KmbZ5Ume^Gy#s};DG~@7~gxP?ey)=&bodL9kh~sHl31$Ztkg!MB{vq^m%0N z8oYPKWyvz!e*fx3LGHu;w8!TdvUx}gvA1*`t5BW!0!TxaV=RU-plQXaAxE#?qPff^ zb;6F6lOozC1!~3ug2oj}2x7X8dVFv*yHbi~6LoUv9HfooR2fu=98Bm2mfKpWRxT=r z1qbgaZ|IT!MW?fw;$N|Uu>MetnY8RzdB#wM9M|w zRLci#`*QZ}+jpKgxe4nIN*-nJeOl)Gckk*j{}196Dx`j3hU@+Vu>8184jk+q)qC6Y z*7@p@xnW-vSxQM(w87o6Jw1Og-=O)soymhhm6x0qbnoun#YFR#UWh3%ScpDE9cf~b zT#eOeND&}w^U;?_b~FQpT$Z2o-raLIgV1O?9t%ojCZIOfOMlSDLNhc;yAA1%2+w$o zzR8TI!?+tHq1b={oPZJ*FYd)mn7R3dg(EHEvaPGW9gZvU9(yN*JOSQ2kV35FfmxN# ze#cgWTN4ih9{P&mTo{gaUz1R-pujIAYCoj3uQ@qwkJZi zQ&aZW%1)G_a5;t2qQJfw1hgdgP}HPHmtC4um_J@!=FmEX&G#Z2a<>* z&N0xP#H)t9aJX${PkBap@2igdmtSBNvqLFH5l=wF`!Cm+09a8LDOLm{Y80j_Dq`Hx zN>iip!3Isr$WSLwo_uXme<)nQ8VnHjFZcpq2)B@UEu`8kB|ue@v4@6KozzwA*U9vy zKYklS`wIRRcF?VV{`qEaa7r~6ItIB>s%+XXK;=9Ar8iHp^-NZMr3ez*agdAviGdWi znXVwN3BehWJyMV_l~5iCqYStB&s|EHxl9r?0ZLl+J22ko}PB< zi%7ahfwYc8|FJeEra5t{F}!!cuGaoy;EBYky{!dvUd>(_+MqDOk|z7j=b*@IFS@kf6jDr^~Kx&1}9BS@c)C8Wc%l_ont}Qa|j7F#}pud z1UUAHf>MvV4!uj98V`(E>uk5b)Z{Q6nj(8kw;WNgGd|4wxLVZ=1G(3sym3A5BG2=+ zac!|AFBGE_&<+IK{Ai~PbgfY~#$aB1Qna*4i=PJrf-zcAZ=T#)N0FY78KpydzStlf 
zK+=-D!VUwU?9{$I@>s>V>~{-YGFsq1{cRO-%q6J4dX6P}ih@dVfih05Ad|mwT~MxI zF&&<4O-F{*fzytVEJ_#|z{H1yMwUfA0#ETDMs%scqQ!==$t5RP2>%C^Kj@GH-ViYO z43(Gd5!SLlqfuJ{stVZs6qx7+Tk=#0>qEpekoQo(+P<>pek3J<%w+-Q5Tfh%-+w2S ziHy_0sZ9=EAgh@CKF7MFspvr;JtzI-N#4SCs%^*YVazJ@TIc>61iEwYo^YAA`;0zh zz%$gkY-?R&-$j3L8P3|eXsOMFWkdJ`@Q6R=xM-7O*S7fSD|&{Y!711kfs@HRJh@k5 zmL-Q(p#3F>RX{gr;i-I4S*kh@nHJ0D8(rRZqiCW*!-94l=ND|*xbfSxhq$gaU@3xkdX}H%Ovs=K~4xK zA{Uhp>V$1?dR#koAdj@E*cdA4%HZPelF54OJi=W%7`^PLdDZ!rrdrY435|mq>rDkOymZG*WK+CGs z4^J%l4p$~G+C5%EBCp=C;kdT8b}YmV4$KS41Rs6drQwoI@}o26MPwk1$RK(~Mn`!G zM#td^25`Juu$z|VSnHyk2(nG(0%^zub25b zPJ$nJ@Hl4q(J|n-k*o>qNb*S>hYx?Ao|y^3wAvVS$xt{))wAPH6oq--SqSd78{lPr zgf>4I-kD&?#N@_11_#a2WnaXVea66?398P-Fw0+|=VXh&LytL5c=HFIo_v_NKZ=yi zv1`|xzwX=#LDmf+^{QoM&-qEsepiAVJt=T*1Ro|VJ^YeM*OT9mXdPfJYI&F_zkd3k zXD$98z5R{+@z9EqbhLx(f}*UjP8FPnE|{flxCBtJ-HwCk0FDec?H7#PeZ=g*(- za!?s^g8a}3wYwf8TEurRDk_R$K6!M8@WAfTm4T7!4eNa{l2-3!B1J{s=OV-x9*N|P z{n_4bfKf(5chHB5%?;6WlM^}!8vq0g>?9qD2UVha7RA+zjz8VQ9l01__vj$Sfk}Ga?BWVNT2JGKw4wy1P?%{Ou}sLOhYV zP9!XfzFSCgp7Z9E{OybX7?Z$V5v}AOcanAPDq7)sDKchJ8^)y4Di>MJvVKy73%PXy z2UWZP7d<*PrGhdIzMN7*&LmTT*T)~HBW*o!=@dD{2yJAqLlJpH>HbJEQ%Wui;E8A& zWWdfM)v2_M+AwIeuoGKfIR38ZL65L;xV6jGK2Ez@Nw z_2d`gYzH3#obeP*L?Y%lh>TM%WcVel8i3>}{;lF^zG%odQEQNn`*u+r|XV|otkBw)@RHFXL(5^fk?0)0u@dswfzo6v{b3FXGwQc38Fj^*oqATyh;owbsoE8&&PX zekWp2{5)3HWo|GFrw?6(XNjD$fz^9L)EI|>)N1UIUbs?J5W1a+9(Y3zAa|hL`cn!f zn@wd?`>`tx?Imy0FgW|>4Am0=k*7EqE}+eS{&@u#((|GXO}?KVSV9ADHW8Cw2|CCD z)+@*S>?i{BaFyuu z^uKPO8=(@$tLFfCjz~W}*Z~bO8Y=@@w5m0`5;6RPwE4zjVY>fEIA7%Ds~kD-!ZtFU>qE@pQ0 zDItXH3KYTNO$Tutk)Iw8Br=D3Zv>jw3s1pMk08XL$>dg#IqM-DR`sgMj0k-k7%B8W zr~q&XhJz3BZ=>D=;qn+2(;NCSD6%pbTe*k|?pa}Q30#MF!=87AEXswz!^V^l ztXl|F3_(^M#}X?rN#Y0V9*vT3q-CJWpctjQ4WtAnE(r(>!-JkTRZG()l1ly$)|1JvBL<*_Vg)Y zJGC)cT3V8?dK?xua6nE{@+gvl{mj!WcrQtr2X*>w^jY)bTd1q=KVydNas<{24l+wh z2%=t)HfO=ejB=xg6!%@;L*(@Bh2^CPWmfy0$IXtyNXm5;o#LQ%g^9-G1SJG9NDD*b zav9Im28|62eJfV25*Afu(*2BgU4(?>rx(brGgQ}#UcsCa0uw1f?iYbF)EY~&40}tj 
zV`!LZa_j;|!uTNN6qlEm1ERFVbHRLjSUdkK1(33gxGh0@57C=?Gt$ugl8ghaQ9%Tl zvr}#EVFidAWJOjqtXjN){XA4n(HCew8gay1gZ`2SK=$S>Tl4^zdw5$X2MNH$kX(>G z{GD2ROV6D@FL=igt!u2~a4S;J$j9yB4QQ))s4^h`$oYj)^Kg<{U-$hiTXn@yOpMp& zNJoHQzq6RH9w@o4n%IT;437gBF&_+XsC&W4&!0y@V;BIrdyk^wC|NauY$cq$nVH!H z8MFcd9jpG|gfk|HWKcfC11$ly{PbW=_U+=yw(F(B0g(Vc+KV}$Ym~{k!|Q}6#TM*o z2sWosJyElIC&+QF9mB(xWF_EIA3Pw3osBcPuoJMq!K z+IxY24n0cV_|!dicf1gcfg*RMnhP1tqv8nK1J8- zZlxB71xCZ)oRKKq63nBkiHA4R;eF{xpJllhPxp`ND8jE{^BsBbuH73S7r{&Y44EvX zD{Yd~5M4n95|mcostqhgl#miO*U$0vvV{}fksauA4TJ1qw~@hd7D%(*;C$7F9`h6v zT2Ydh95+WB93P}L>W4#@QNS!R*=uo@b%UVet~}g!!~}#?`?Rm2m^)@mZP(l%ASuhZ z$|||T{z#gO804Zmw26TpOY}-oGBS4Y9+)!6vUtFS#buAIaRQ~DHtJ)I`><$qv*ewYVFl!R4tfI7?s$(nDM>T2GLfctCbdvN*fVi-Kpa zDA9z?gi?>9pIdON57~gHri(KvMbATQ8l7g#5clcjzB~{M80e&hBSX@l24az#jgfZ2 z7OR+)kQ)8q&vD|ONDO>M2{;gA7N5FL3NN&w zX`>Ekm4VQwNC~-&8*#Y)&?YrIUsf3im`pGSjrR8%iUUZ5ciX-sC;3sPO3Jkz^~K=L z2H*A^v@L9zc}b)lgmiwHc*nW4^Mmz^2mvA`A1UgVrs)7%4K*4sC7#HbLXaEOXs{1C zb!r()DmWt4{@_};<8dkap4Q`yT##vru!wmEGLTA+ZUL~bf|y0opI3L7YMuj2?+>pK z^iG}g54TvyPXT-b`r3p2m&j9LD+0NYmsA_Cus@4dq}+H+TP-8BcItN0mo^-5z=%I&e8Qu9m$qn7f)BO_z$G)2CeCV+LPrmz!yYseo(som zg>|4WWu|uC;3T7O4DFK*un&?sYtp|#$!5_Hc_nu#xISt-_$8JlQ5@7j$W_!;!|Zv8 z6!gF#%sj*#Mq@;c?(FQ8nQNCt^Ja@DPM^4(Tj*{OjSXZW_G!h+l^1NpxF&Ns=dQAs zfE6M$9U&HD@cF5EPk}(B5HDS&onUWd>2OMfS=ATqAq!sd#L1-#K#?Jn9c5CZ8`wpY zfYl{I20G+<`;VA5wU}0lZXlIWX3G;lsKZ1PsE$mUH4o@#;&x2wS=g00h7%-Bo9@BG z2u2WocqHP3n9M*!gyTMHagBl9?s?~FEjK?H<7baWaSLAsd<*!Dfc2H+xU|W+bkWEhI9?mnw1^quC+ztYzF~@ zqRLN23-_)YuxBSXFPTn!)%78VG?4jci~=WtB_kazxn-o{dW?##(dqHSpM84xA*xj%9BqA#@U_ZkJmd`oHMJT%aA3Oaq} z+0A*Ttd{Mv5*Ulkel`$P-dt4ej+0E%HRJS3sYJIV)pU?i6#Q=x&^A%SN$|uI?L=^R zdi|B6!_(Nb7X5>5Y^_`aZI{H)o%;b~nLRWK6dHpG6l0`IZ;87G(J-`AJDsAx*QdGG zdnyqJGM+}xk)Xw1%WK!3lT#FdD+vu^m6P^eUAr7>105ifJ&zoj8n>ulXPKxfqMbA1 zehJMI5o}SRshPaMD!Q^>_OsTYzHYM|vo?jwhz$Xj+ft3XgEX9|%oz%dMo zZy?(p^F6sk=F2}Z+u|g4i)@Szk>jAi`r5b-57ndV+0#3E@6G_x zpa`fM1p$l+xA288t*0g#Edl22;%PX`3azbCQzjBOPTMk3b8UQ%LyC#b0-!(9-`~IA 
zA{CPg)aYMUUo-SulwPFMM#v_beQ_{>^hVBzTJSlhzC{ae448y7%Dg$(6G0sZoRImE zeR`5oQl-^`S5?q!5QjI419C%LTE>@vl3}oa4)Kxvrpn%PR~eh>1zgE&sdOi?ms92t z^ttQ_{R%8Sz}Mg~sj(aW$v9%O2Nh!zKL_gZSa{@BU>P!lL(XGEo;1b~WZ=Lreq+m) zXl%*6@S=8;5Z2qsC4${0FZ=S-mqbGloA~%B{I_^i$I(%nMXm{T_c%QXgH+>xqVpT! z;*oe@InrTj9dnBT8A* zt$?nK8^(D54`6VVtWcINlm%6I|au8qLMuxRTid`D4A2?G^l| z=OKNQGsfXY*K?x%WjAv?4^sd}=={hr)3l#vNSS16$@cZaXYJZItlZlpot80VukU?W zqG7NvszifX_cKN#^?&>AHpF__RDr+&$NE&=kq5jb{BW?>8<>7d z=`c_b(0CF|{Vx?PulD$WPkP>LwXe#Mw1Ln4e02k#;q-K|#fr+x%9BX<9H82_9Jus` zj)D^l!jDO`l{c#7k7o9pCqNR$jiU1uWMgW?uvRT@e+>0PsPC9-zP|WeSkp+Fn8i?^ z?sbR_wP42fmSR$>1c$%3kxPLHS%xEe-b(T!S37vTqz7a63TJd2r+!8r(+6345nzD$ zkkGgQ5|yN(jW|x=>wp4~qBEun2=Vl*#p$Xkte^Af(%o-*&P*$CAENR6?Xj%XurXGcNfkL9m5C2B3%}eFpm;OqpZ}Sunv=-1#LLiLA%@iYKIx9jOaVT07 zVT;hg=bf~50~F!F2nG4gir;_#b^qDf+3Unsbci)X(H&QL=b7$${SD)YWU( z+Qz3xc7Q-4X32JQ{k0{XvF>GZ3T*cL+ z>)_o2E9oy%5mrqvC?j+3>*x>6iuI0mkyBz*9=Y)%Bq!AtB_)9jlt2Hwzu=@JU}6su zyDaP>NLQhPrDu@Rj?=GR?WFHz$l@dx=^HyKTwGjPc?<$9!VWUvrb;JAA@HPZeSDhy ujRFh|)8b|C0cKw70+>BgixrBzDm zAT52@iVBSkug~Z6$c)H&zhANU+H0-7=l(fG*^TR%)-f&D+!(eCOME4wS^mv%cC+t`|0QIGJ7 z@CqER{r|% zx*hMQ)i-`y8KbiGnf{8}wJUG^exh_{;a!T0mP6fz!_RXKs-E^eu{Eg-5)~JxyF2uH zR_rufiDzJ7xbTbj5PkV8hGNm*@TT~6-S~fvFLIM}Rxr?|KRwu{>gt+%#*a6+UGEpX zpH1re9lYP)tzoMd>2!VCuJ>kxrXa)BKiJrkK7RZ-M3?a*U;h00lYII9x%R`+R93^w zhYIVQJ!8^#Dh>8T@;~zSZuGQ$pd;kdH%*SC1<^&Mw0#^$^%c&^4^d(7kmpXygkR!)K)3}!?umf zZ}e7R<^TEOUtlqe#sr&KjZT<7ly@3_`0xNvKI!7r6*M50-Q~k}Q(=$JR z7I$_U2w61NM=OWf^_2S9H9hk5R5LOPS8eXdb7@F1RocdISRa*-jh69`@zQ43+O6;F(GHAlRt!4&@~;(ZI)(8rx*(=s; z+P{1Ev0r}qMeN#-3z021d2WzA#&A7CJJoCt#d$Cx;~9u_c<`U{Q<&QNNq_!PR&`K#(4te%0JVesvfRvOzo7=>b z1M0EQVy3=4c=1bCV{L@I(1o`Q*tfGY<9(VPxlZ`<(865jlE$S=uj-PqBYqVob>C~k z?7zJI<@D*(F=jQd#(w;$)--U7c>a86^A+LKr%rvZl3uEp=gSMndX%)b>YTj0$v!p3 zw2HMzAIF65~G=(#bJdzPJckcg_Zw}( z`_4BMdNPJF30*d;4X4DOfAfmfWr!aKO3}E?S6xBD)1)FG0{=ITP5AQV0;TqiVNqKn zo20W~`)upBgXhl~xX(mz#3+SIJ$U%=DaRvLN$u8jOI2K-iSDOLCb5SyWvj32T6}r? 
z7gqJjx@`x-vB`0ir~78Ryrl(gy5-W$>k5bcCKu+Wehey)5yNQLlWH44ePOZS*t-}9^?5Br2G@BCj)WhZes0b2Oog8dx z%C;Y59)5O6N5i=ESq*-U{lcuh>U;0ar$}A{?s`0x=0#z@!Yk!u!Bk=5-~nJ znrBq*c$ep@rhwdw%k}#|6&1yqR)w6AldI+mb$x}sIBnZk*^pGQ z7>oOQ4%@Bn%_ln=dUc%+3xbZQHRUY73X*_b_j%+UZV@Bt)q!-G)G4tWq?!Jv-ZZGFCf-ns7)b z{W}5^2|wf?ySt4jHvR>B*zwcS@rntn)^Cr-stXjF;ad*qW?uE3-ck7*(_;(!q0h?h z?(T}lGh;nx(BhG6@#P z&_C@kP|I_kwEOmP?IG>dVrA;)4MV@YPgq*OLq}=tU}X(4>7?#pVj7ql`SJCGKb0iu zOm!6%8ad#>V)kbq?%*RQYU|FO(KJr^%f*lPIL{4d*pGfYU1Zwg`T2_QA^qI*NZ>8$ zmVvC!-}hj{l(4L?D}%+OYQkjY&$w*5o7!FSRQ8BT`D2a+Z0~H5^x{k3Lpmlg8#DWS z+F2Nq>)zd5lXj(9fJMws4w2)`*|YW%kwxtXQ@o{SqvsbE>aHYR8~qk*P>?U5wUB-f zQ6pO~3H#QP@0Np`(s#a)b!QvvK192-_z{o2y}e(Bg-zmWVGBduXDK9zv}fBN$r#7m zy_J0JG;YEoaqZl;18P_2nkPn!*qUBn{4i0cByF5sAz)l;R^-K^jc6-OZHvBjYvt`b zcVcmVv~#b$315=jC%}VqIr4LaMciROC+F?f$A^yL@|Nrj^D}VQ>MHV98lNek(?<{J zW!q&BN2+@a)N@lr`t+|B>AR10Q)IQUgX-CK=0y@4aVP6W3YHR0L)|p;^YhoOUvFDp zcOf+N<=H@CEiwMc$@mVE5>2zW;jzDXAaCrU960yldaJWbmfN3&;`(OshNFJ8PjHac2^SWONWw?V$Emb1d|c<9HB@_xMP01aIu zg^za8y4BL{`c;a?a0q7S$E$jJdLjT6RB%4eAxG3F8_TOFXsZ`2E#?r&^9Q@kQOmxu zp0bt#j{+|?X?pG5%~C&)0gaFtTOTI7S#tJ!7)eryhp(FSUCz4AO;t=R&QBLCH6XOA z=Q-QwO?S9>jRC}bkBJJYeR=Y(pi%MT_)S}=$8nYkfS`q!JmnH^-}?q(I&W!l&ev+> zK3B*&#MCtR1?RIN63sExZy)cITYtv)U`(Q3j#z_w^tt2zfGhW<1;;3!?L*^&2N53L zym@oEKKhR>P2}j<^i_p6AvF*EO!08fPa6W5QyjDd%+pD~qKrTc?2(FCVEOs=aV$V} z!P0_Nf zO#OFUNzI5eK7#fGYDMbR#bWzzo=Ti}zn;%*cA`H((42-)fpt3@Ea9XAFu5EFP}-f( z&O>Us3{ZEuGe0lHsh1J&tc7gWnC~_>F_=_v0I&#Y#9qAUWU9crgW;b;!j$Efc&wRnnjf)rtTu`R=p1|`(ovmiURR%FP%uQ_#PI#*W~p?Q{U-yMWBf0G`&3~M;ymwnwcr_h$`SacEdZgPzP?TD7lyZoA{B#Uk*112d^m~Bp@HP323#b5nbhwDg>|@& zpH+`Pugs+wRBW>g84gInmq~;g|I3Qi-%&K_u}}??!m!0QQO|)qM|;p z*|j*ooQY<|oDRb+5h%Q3TxZ6{CMVUeUVTmK2qg4@iT*mpGJ1QC=ECfxYK%%mcUM;! 
zDxw1i4xGtx9C0vrME*-|zQQEwoVtnm=!@v+{RHu_va-$eD8k*st1^Xr#@-`los z4M&0>c>C9i{;3&M$xKH~Jh6~r*oajtSMEP}PyygRGww|$K%dm&PeWjN6%>|F0K{+H zxKR%89_}-9b8%Q<^5KaF1@zQBoz|QNkQ~fMzAXLh@>aC(bej%F+5-r5 z)$Cf)?Lew3f+`pKq?c;2e`?^V2=>&Xa)ERpEDoVgz!^*?&TielJ(57BzRKV_yeB6m zCMJfEE4&8z@u=np{#wJD8E1*}aQ^m-Jv%<5@L&=&dx3rD!D|eBeiQco{rhKNG}dm} zXNH>4%eY{6fTtlwm6Sc_fU+B2o?L@7phyZdoW^LBB_+ecQzM;8t2Z*4kr!`__YV-U z;sq5$sYOK)i8mYQ3-!>rxq2h2Tw6lH0p`8f#3nVAo(JR+bMeESS-e>cmT13(gf5B( z5MFaQ^U{()3XT%2fg)Co>-lnDfW(PD<8xr3Eh~k*Fq`|&<0wSycNn-n>CHAQ^i&?{ z$n)V+Vnxkc*>iINcPbpHlxN>Q8I+9Y>LSnfe6=38oC2(h28BV2F5sqD>6WeJj4eMx zaj_iA)`}WMKp_rpF|n4I1u%g}7sV5@0g2)_t!1E)#{>AZ|M}x0zV+}c zA79?$>|ns*izj?r=9&r?BfvBEr&LL)hDbQkvptq(n+gcxbDM5=BxsG$F!qCuF+jV{ zUvKZAXS-of;;s&!;yrBr?UY`DyB?eCkH0|Q^{Qo9bxLj9f9@D<>y|BHxFduEB1fdo zw!@FlAQ9j<8uwVt?)&4iyS+~1z2?CG0M6ds{r&y>r5%Qz>${Ggc5-nU1SO)59pa?m z*Fa}M0S;kLdKJi+Cc*;jIeF)MyJM%l?k3l%AuYlQg}ROrU<~d}+t^8hEID4D`{2Ry znE56H4|PzYz2P}16&PaY7qycdsKJ8Swr(8-J_y6Q1 zdaT>Wj`Amo8RF*Vq)3|p{nn0tZHwhoJp}_sGaT& zv=Up$0ia*rC)9>;(>&Z5O?svSkfz~}doPaw6?zl4|0VTB=LgVG? 
z7k`lgHOj^7;r2rV*96<5Lfr$J_gqy;)D`u^&mge}JXk-f2)k^XFB&-D3eyi=A2A6{ zl=38HK2ri6LL=0VZ1HZ&s&!kdJr>8=Knl7o&h&Zd?00o@o2)*$xgWd|>b)m+;{yHU zNLhCkl1rFQZ6C;7_l?w7Zw&7eQu)i5FXbC=g`*bF@G#di|FwJhUSJRG`)fHDP_Qxb zh6V-2M};__0SV*vqxt|ykS3g(1h5C3oSf$13=b&3x)ni|FH>jb#;jwYWH>0e*(!E+ zskEn0+j*xyY?hXzAv=6a&qt1-h2w^1){{bxdh#ZojzW9&>Xq&Q38#eCnyZOXglcDAec9Csva_sq7BBCy ziW*$JO5dqx8!72(YHG-W|N85%=aY+ zcDE`_RFS3|GOZ0MBB29^?Y^I@2o~35m-VWL3INUM)rY&AEkLxaTemLRD2I@~iwkpL zHTl((^xGpAaTR%u!5z@39b(pV-9C;y$k5D>NDj7C@t1a?s$!i5sM{#;qy( zm<)1?oNyuc`D>DMLn#4}HY8i67#NOe9F{%FOBi-S?$c5sHBnSRBy&C>ketrQbQdl70|HI~gEyC@I#;2h4yHmozEIbka=w`IP81Q3ZM zupbJSYNUd-79YrbK0z}sD$5Jw{`yu zAOrIccQ$&BJ$tq_4>(lp>X&1_t>vD;LHaa7=Lxf@xP6JbS+7tclN4RI>$X<8>zb zqe5c=oV~a{e{QZyf;0(C_qc0dKWd4gj|p3{-Vz)}*x80SO--DjZV+?e(#)~c0NV{< zfq0k2px(;hI95sLXb66Ub|dSKEhIr^7TJ9(+yIa;8+yjRral7%#677a93yBw=x=JM zBkFLeNXP}TO&BNMLGW4f3EFAM8&3&ZZb;PAAtw)Mh3GT=$f~3s0kDoc_~^EupC1VZ z2#E&(1D`y3azHKmvHRkTX|SXVL65b_exV1`{29F^r=F9vPnfLD5GF?&mqf}RElFTM6$Ng?o34c>1Cu~l{e|Tdc)4$!1y3x`w5uBl?rhkqYQt3 za}qh*=3wg^L)nF?Rx4+ERv$tBHa%w!>1EejkBlLBv{j}(4nejzhh9gJ0eQ28^KVXI zKWedVM10N|+{&$Tid<_X0zz*Rb%UfH_qkyaA3g(p@Pzp|uRY>S-efTX_zmxnyoby2 z9hu8+A`1D^vEIbQ6#!ututz;gkp*F0g9PdaTe)93b~wp`0dttUcFx1YL&&W97%iyR z^mWSJ^@LYOj2hAEL?~p~opNPgoOA+1@#u+@C%1Jj<_*zTU~@-Q8NQzTErqNikW ziLUvt93dMwZuEIJ71p=p^^XS4aQ@B<@*`1k+$q^Tnz4P?uE72*rdW-HSP;N%U998} zkI`kSuNS1@(Jljb`D+VHNCf@;x9I#gR?jD3>x9d^9}rQ8p-sH}zhd?n{*B`MKVOgq z`hR~lx`?`UH8em(j-MYMhlk7;0rY9%#l^)}=#-383j)LL+_M6;|?Qa6C4|kZ6>H}5#B~(4X~weJ8x?`I0F3E>saMQKx}VG zp1NdcNR(+J^CFdLf)9i7hI;n-;p3eiOL@YUE#bhqTIrJb>$|yNjDuDO3r$>7ek~{& z^@uauW@cv(Saswg@A(q$OfSz_C@1G0b1bM3=i#)bi3*X<@8BdLx@E;p-Pyzv3(rRk zq&A|I5jO$wML&ecCfIK#YUTN~>S3T{*xk3+?5$ql5mPTQM|wP8{?8L3At3|RFK-f- z0|gbND`k>+P#5{Nx0gS;bm*3KN}#ZjC2z`;Ogp1@GD>`shAnl?I#D+36&@ucoy$# z9}^0=j*6XRIl!~BDUvNR*?t%Nk~lfssu}9Bn1w~`RnrIfB=p!54kF_NQWksBfALQS zGNY(KF>h~dE%SifzoB*AZ@*Q?Yo&l2i2*rzRBx3vusbXUa;Q=Qz(DDB(#c1(S^dUD ze*;Iz<+s;=L9Plw)iLrs=S%0(Vn!LC|NoV7r^n8J9?uoyr|(RGee5aLrf%B`EQ@kg 
zh$I2YJ%IgCiq#zwN!`;4WtFDQ}-Wd{RKW5H%l-6JXs|%?%7rH3z3rHb4#@! z)bKO5V!4YN{?ShU_ixDV5@Ee`XV>|~wk%ty?<(MCP__-EClR24qG~6QO=K!!oP7f$4khz39C5+953oihs5n0Sk?K&$szbNTkFSXc_QmYp znW;i9Q&%W9>n@{A19;;jB`sKbVno3bL_=)tCrSe``~XCQP0LCZF)81<(7Ch_3K$#? zs*YgwbNDQ{4>iH2$`O?w2%5PJckQQFkyu8ulJ>yJ$Vj>r;ug?FHp3-fT-)l}1iW72l zf|T3r(F9_Z+Q=$#o;d94{AS>dqtwEBL--crk|K1~EUMqhp=MPakXcl}f@ambW!qsf znnZw%k@aSU@$Ut@+HadTH^JZ%18qb+@yhuehXhalbsIM}Kwv=WyoZDy8feNx6#o#{ ztM6~ErG?}AwU2UslU|s3O6X1q1!h3-iUvbZpFW)g+jVr&(vw{6ZP!U5tdgrX-5ESuAvuH%nLRf%s#efwxv6p6}v_nsiKy@LL=uELFy(`~i^V)n5e z*Sa>sjMiwpA%A{MJ^^|+PP$?b(oZAUHnp_N>M++y#+`iVQvN^WN4qkKBi@FI8k|`K zXUX@m_4fI$GlF0WD7Bz{hm#fmW@z&R+M5#T2v3}wN2n_+<4jj=+;|>CPZ=-f{;`OTvNCy;Vb%kk zf7E2gb9o}nru`plny{z+sLP@XaH`ZzXeVGX}X z$yEQ?aTMonpThGOr1poY^cJ!t~sE$r?s|%vK>KXL?s`@8%4y!(y_>$ zvA|8V-{Y@$FySD$)kAiHHTmd(C;u#}{b$l1i-KI`DPY+`LF#ZCdUkj$xfSVd)ba@m z8v48lSuptP*p!>gxBFZ{RFOA>P=M-eGDrXejsHzWKg-6LBEsfj<|ksMPds@L5*(aj zujjPxLv^`>#HX)!*&Li+{l$T35ChQxhEH1;1!VDe_NEtOup#m^__)5Ncj)U!D)=8v z3k(Q|y}f>i*rJ=o1>&#lKGh~>y_wN)0#&HR{LfLvr)xV(+ZQ^f_$16Jaf`5y(5RJp zU%c|>vF={UV5#Hm`fI(Mm=H%Jfui&X^8tc;+xG3f&!gT+NuR{ksm*KohHnQ|@AK{3 zH-dc3;p6M4D>?ImAVmO&(WaPx^J!!*e_c_$DRY0WVxW?^X@Y)HJ$339@lS(|iGecWpT7W-l;LO29<$851f>yICWxk51R(+!VZ?>a zRR;l+IQv=|-b;C$&=!bQlGir4`r*R|A=_RB!e=8$3KSmAUkL)%3;`Fq&bdfxVCJ>a zS0rviRf_RgTAT$gr%qdi7) zZus|YJQ`|YGLLF4MfUG6ju=ujur<=hLUe!q`Vz!^KXUy6&{tu+{QSxU21rOs22&eK zil>HK;i=w_rImx}xRH3^A>$F>YQe%(VA-$|-ual~+k`FC)wp3C6exl*MBVBhrF+GV~ zxjed=uR({B6bl0)_lKSlyAAYPpol>IMgj~K6w#W+9^`EkOq3)CMltM_M5=&X;76m;1f9l=eD*7! 
z@Rk_6NAUJCj=~9ym?MX|jSng6B`O9Aov*}T*)5kMoo@5rdgXxII4S?FTMpt&h-%IT_hF5}d8RR7DQ%l74w>YBD5z=+UB!#05!FG|s4c`3G_m^gZ#4 ztc%l-0%A~>cyT$Mg3%nt-T(w|Fz?~S>|f`$OZdujY;_dq>^LAAuQ7hZLeiuFEjOg= z+Hy+x^Tr*QVyXLIa0P>@PtGHglc-acG5v9iQZ$_F+6C@;WqxD#w{TsCQgT2mIlRt~ zl@fdIsqe91A7yO-gX?xa{dj2j8+RUlOUuU+koVM1 zF?rM_{D1(J6zb5ZXVrZU=8X@jH`x3l{QJqmn-f@Z`(=af56nKFT;Y7-D2Ug04D1un;uJ zD6zk(W1WBFVh`-XG1?t zY@`w+Sbm>UNIU|RMQ6U-EZQ)({r>w40D$bgPH72=HtXi4;S3q9=v7Uw_S~)Q?tyZ6 zr^zQwOl!-q-qHJD=^(5$2efk8b+g%fd}$S`LV6ztkGE}~Zg zp9;NOVQF!?APw(H_}U_GwkF8Xgffm%jf#bTS&J^K?L0LUgPsyqtfVgxKJ1QdydsxA z{zeRq#Oi>Qr3UViI0Xp5k3*G)hJ}WBt-4Vo&+_7@T;71)5lEaG_)b!x5#1j9Du>gW zae2px4xVtiZb+>}cZ3kSPy`3pqn`k0D|_Wit(wTQfr4DueZqlGze5amY=d^ea87n3ivY#Mo4KB8DC3@-ND|X?aILKcU?@9A-Sg(h%CrF; zNu3Q_rUh1fnVGMTeu-QldPpZsX5#`9pYuRWlpP9MqAEa7SSAV}8W6?Yv2C*>HSfkt z00wiZJ=mU;q_-k}an^!>cfi#K0H;(~R6!LC_-N|a5SK&G@DhRK&eQY+*m2Ad8VH19 z-wFN=uAzX6@adFY*LAs}lZ*~`meHyEO|G;K(SN0@;<0@>PRq!c*ecnq{`Uqx{$-1u zUtpmi{)S46cgT`B>Ea{5E-<1$CbI~w3d3?eD2T5Id4at3D;w%|LzYi>eXl^i>uug? 
z(6()!j7J30bfIhh-7fjX3XHFZXfGQHo!5gNivF+FP5Cd|6Z&_Xr|S4>?tT0A<+u-(Wk|Uq@&fWO8E_0Uu{CWS@uDamT^`CDBIsD648_kaVPnV8~D=QYkbM3FjP$ z1tQ$RG!TY#*`dq^I76yO$H?HVA@I1Oc6-V zmjTZJrsaHFu%Jn;V?bcH zXv9C4ck(%)p4_J4kn_|~jYjAoRLI%Snt-9@hr4?X)y~XG?5E%ys9(8q#kVR!rBYO8 zWa7x5eKpMG48?Cy&zp1jV_5^h2^}T$g(-*sj~_o2GDs^edkLz(PaK?_qNU(Wo!Cnz zhFhE7hjfPGG*JnqO!^$7#?Woxj~CrZi3FJUrzR=*3*4!D%@1-zh9(MdpqKHtNQ2IRE+%*kkY*T8+kNHM{s@Vz1}!WPz#53-GS8O++v zE;_t3?`f4>kgRQb+i3qU#kb)DKKs|Nt2{Zz%lHa1>ze?YEBkbS+h$?Ny!$Ly|URQ#}w->8TJzN>_0~fze2*(!2FL9MX@jb>JoI5be=d& zE5G*#yFi{IRZ8Z_Y;8@py#;%TC!2If1RbChe~SD2xPKVLUWJ?h3?J4pFkIbyVzixC z_j&&mUBq#glMbNcwd4ojYU-R9G{spNTp(ulWv9Ir+*2~Nwd>cH)jDT=-I*poNKQb| zBT=U7%$H$>Ae1h`!VBzTX1z4pnP}laf5IQqePLm&uUEstKLy6jK6ZR`RC#V{m>9vZ zpIUV@1up>=DKxAK+K6vop`WFH#9o8!E8={QAX3@eF~7jc!C^)u$wf!rJavuABpRZI z@-RHZ2so%khV?T1p8f=pvD^@ZMKrdxvf=ZpNo}^GbhQNtJBB_v>fP%KUAg-loG{Ng z8WMDxEc5c_!KkK$CqPQst7_S!ChNuggabxOQ>RVW!Ic`^-OT2TN*<}JA~6YqUaPLULKU(@7}#zZpNxAol&3dd^7iU#pdGH z*15#iG72s?aTrl-T-aP>NUei~A`t`Xf-06u9kgCtj+227+9w=_KEK3yfC7@bAhT^Do(Q6f-1JIGpg-!ZIjH|3}d6Jf29eYT*8Q^4N)U>Q2l@U4*41X7L2MZ}m6ijWG6)r6Qv39*15L#%70Ne$a?%zdqUr~L@^L|dDUXXGS# zb>7GAPfsyqijcg7Tg@z5Yq2gdj?Ggz=%~I!iaS-##${!+WG@X6~E@S4$4MzhLLge`16% zV5B7tUBMV}!w{=kKH61u$U;QCt-t#=^i%r5Revem4n`TNB#kfpgdX?Pf?rCBdPYE&{ks(dQ9`n{vNzMY$G(wCyXd#Y@ z3G(kb8=GX}j@HWY;3CKZ!V_nE&?%84M`iSys}i7@T=TW^S5Q(EMZ6`5q>tT*0+YyLOl3Z* zs7C@@!QyNjF`U9aTh>xBTx5pp2JFnMmrceMm<*geR778b(KXhpGoPRr^&AIFHz=TE zw!|KwYrcWYx6cwDFP*= zBAb#PQm}9KXdm{Qjc2u(GIT`i@R>7bvInOYD?^a~i7*9(+la%DMs2ehVMGfHzXpCJ zm~<$7!~_Y!Uis<1vj?q~MqE8jrk1me_lsYOUlL>job?^Sinw{@7r=aQQh>jzUMFCW zN>E}W2-X3e1q~pXOhQl~9#W_bZ^B2WQ@|+_BAhgDf@l&#h;njv*3N!J5FmDBYuK0VpC1@D=FhHR?$?su;{@$m|!Sv*0?76EizAgZ4EQ+5}|crU)sXK26F2&BP$U zgh-Inu!2n_w$A0C6JRld2nB!(+F3RcFlJqv5ym627zX)|pHVdOtRY0IvHOOjGQrJO zf!{$<_DBoP2Yc7VyUKi%2v~14&G82VJs3_oCQ2`edSUFH{j*lCyX!Y^cGO-(kQP>t zRpW=Ebk&@U43MGq!)Z%7bH?Ub!H@Ur@sEJ{c`c9TWN>~#{H8oHg2I34TUXZ+U9JRv z5Uw#=P({ne5UB+b*N)K2(SLD4t8+vXG~r<&1OAed62%_S6hbh*3D6YKSsyfTRPl%@ 
zfUgXV8}WcN8Q6+?DNhRo?5vc^GD9!5C_-Ks>F+KqDk@sJdUXI2LzF%7!KuO%v4xSb zzqSJ-6qvV6fD3L^&d00~_7b)D^ELg?7jsI?B&6Y`XD`{xqfu@yQTMd*8bV`}>r9Uy z@0JOqlS=g7mdFVJkPE=Dt=0KtPR`lTncgNk4j=4u>81I)n0e4>z5b>l)}E8YOKe>` zD}Kh2ZtJzb{RZpYUIi~qK>ozM6u-)B?Lsm;l!t#U{25KjuAPYK4+0D07)llm1}xQQ$PR6Ff( zmk!CC`5ewBL`!}k7EOd}2zJkj*_mK~f9$f_ce?~_hYB)@rpn4{Zt%94-? z>CzZ(Lb56jgfd=s5?b?qcykerePFBw`sL(HDyS+-2V_PU5Sf|*am&ZGFW9jyeFhqB zEZ8zHk0_xKc+iz$hd}f8?+r-?1^@=6tRXLgqG3P43Fq8umc+0757A4jJZ?K2rtla4kG41?%cIKI*cwo%W zh&mhC>i{3Y03#fnTM#61M2v;6eE#CF%c3!!pN7&1{RLcHT!C0zfAo5E4Gvo1Im!*a zB~2VgIxN42ED9wHgiKkN4E)5>b18)g!d)bU&n7Qk2&cr#l`EeECp2D>9TWhxnw8?-g z80kJjV?i-cf(c2EThx|W5r)tFj2z2f{eL|2>#YjrKa6JJe*f~tT{ZoRR`Zo~GB)i9&b%a!`Ya4LYO!kE=(ITvn!6gD zH4eHoVS>+4C?#}iuiSrlia>nlQ~4G^`#2x$sWY@ngX)0{|Y9UUFb?b{d}FA^dI9;+DmzoN!SyC1F~EgT8e zM7D{c`x1I`ue-cLP5_fcwR-X#J)i`kAXwV9e$H`B!aGOd)|3<#$pXQutEoN1aU4z_ zLMRc!Py>>E;STddkHa^B?yg8xXaOz6Qo5Pg_mS_S1FEiBgx)#SAHC`AMC(K$t${#9J|;;& z?_lQ?@xZ|84!N0P6H?SO)#{Qw>m1QTty;sQ5q}QrfIzz+-vHi6G|yx6f7^IFd=QBN zJLu?X4gN?F+7MtxiMfk;@Md29-2GrN(8#vzFj8&DbHGbhkKYs?okWqrK|u*nT0ds~ zhD{tlLKE15EDx>~xiENmKojDmAw(WC^FgqM#M@tmM%Y&OHnB~FQidW0Is71|D3OFh z^z6DZ#4;jQVJ=wh&+aoMh|sohGs!NX zdPJ?1<7j0&J7LaV0+3S}5e#?NPCPn$30}Na0+dKE8cPHat$xnU1)?4mJ5+$Hjql#C z{9e$avD|L3v4Z?mbk?H2V-hMg-i0O(AzC?RgOLxdIJsHUDCF9=wW@G;u$Led5TBx{ zb7$EzZbJ@#^d7ccb8!hI&FJJu+0omYcN&%>OwI6ud`K)-b!pz>3-5m=I_VZh{+=)}9!l>|ibnIcohLPMi)o0Tgn=$f`%sK2lg$ z2jUD}!Y}3LG)4;Cg4S$gGJ>d#L>hDd!2^ol{zDZY4SR`)g#7N0#*>|>n$Y5XWtk1#V32|<7yt|!0M3Gm zqFB_814^pFv)LIc7 zbu4R9MEX;S&|#F;Y~$#oGO zB!(ykjX3-fDMVSCkN|VKMpxUcgQR&qg1V6jB`Z!A;}Mgi1K@?vvX{i4f3vsZ#aAc_ z`yf5-+r9g3JlYv!=rB8rAp0#-%KZE_%B;I~S!g8c_K%$&f#)<5qDdi)J3aDg3lDFj zR@Z_RAxr{mWjEZ?ASVD;OgnPAomFrvukPoemb9{nMe2>!b316TW{MBM&~X`;R_{Ht zp=Yagg_+LOB!@pSyau6^QO-w8B}yH5262r=^UpIHmLr;<#WTvJHihBE1Qt3On$u*1W^g09vrmi&G=|}Z zxrb2lT@Q^+Zmbrcn`1xZI7ZS-zhy@eKgB+iQcV7a-Ln2M5Q|Qdmk?v6tMsb0^^OZMZHC}*&Xe#<_EtIZkY!ku{8>TyV<@Hr9&tVm70jcQu`;yCXhv4vMuG>8h3=%@;CQsgHw_p_%ew3B;x 
zGVX0)#PJWo`TCA~!^V%JI&Duh^;m-95eW9*Zx+Vjos{0Cg@UC%6-VeFU44C~I442+ z7IR`5`5vf-%)z{vK$0`&@V~icGm|0pjyaxQ7%0tgt!NuLl?eFg#m`u%@C5%uswoc9 z+g#>Om`Q+qo#*1{mrc|iSlJMF07SiP<}!VGG%L9;2=w;8K{;`}Yj}8AEyJXOiM<5V zzG{udLFZ$)Aw;tn6OImPPSEVj)Vf%5W8foso$a&W`h)O8!;60kGMF2Bt;Qrjd~!70 zyq($To7|SQY}uBFUCT?+~B8VwE>kc@#Bjha*TruAN?Ff zG378U6FmlGaUJu1pw60nlw>ZHr6K6obXUNlp%WyH&1s zTvB?|d<7HQ+_L93zV%sdIC`7{iAWxpwGsD(=vlH_&O=+anoNBEgeNZYKoz6~NL_<+ ztxk6l%oermIyvm|4FhsXTrT@K=)>*wWKuo}y0<=T~+;+GTXSrfZZppF`c5@yAl z@3wxp1$+xW2^10Li73D&M}`?ebm>1|sP~&g94gQ_Km;-nfV7}&-|kBv>n|yFQOwS1 z;^*l*is}KjRy}yb^NG=RxvhEvH659xp$6#x4A``2Y-;)~(zK9Gcd_7{2RecMlJ}l| zfVo3l74{66REtKUDB@Ch`}UX{SVY1&5vGo~}Fh1e0v*j7wd z%^NM;MH)~@{e$Y9oaf8U$q{vRjMN_bj>Md=gM%)zgU!>eEOB$sLp;-cn)5$TN8y}# zHILMdX?i?X8d;yAKIaj&+&SEqrHVY(0M44Ug^_{{kvl-bDH->}U(nmwu_$E0J{rB) z05s3&={IlOpu4Z3|K<5Bt+l;kW_Q^1NN|2Z`|wF!aQP7pX0hMe_N*`vCL;? z4R&5ESob~{nJ^ahT())B9ima<{Ix3^jQHd@b+#uW9XKy7Op`+|TYZ>xizI5QeePU= z>|beTcF8d+N#t=WnVCoDp6f}TGDtPJ_VxCLCIo~U)Hb+2nF=1G1pffiOhV||@3A|>apMC%WH>WZ8Aue z^myR;d@+O#m%H6@_)#@Y$*9W2kCDg1*k_5WeooFH6WHV$26QrZ%T*RFM*kBLR8g`+x`uddx z5gjE;XXhj7c{DQD*1J!-oYIg;r z28dxE*9P7AFLFMAC=bNp)`&m<^`|3Z>VO9Pgh5VhktL- zG_c>LV*C-z67ja609UgVsFLr?vun|`SKagADJgoW!>{>((xPz6$-sH9F&IA< z#zU9tQC1S(X@k|*l6c@q@UZ46QO!Gq_q69$kV`?vXoGR=7M5tuvWW9>6maN6zMP-mcvJ23=qmvbr=CWkCghPT*prVR{~f+aGqQNFWD#1 z+dtZ&I|2x!{o1u_&tb8xkrM!?V&8IS!P+51nrW=jjWY7blMMNhQ8$OFq5^Pd0pLOc zkl6jfM+DqubN@_ajfx_j$jP0@=h%P1nB1#|wXPCpWo&}M=?FCEaX7C<2kKmMjkV{3 zI<7Udi8#*fG=SRfVk4W*>@MZAoCD>ffvfL^fOV&Vhx3$v02ID1kN#M9Zjb0ixnX zNHM*sl^9yqb&U;X<2FJDVT5`kYH@!M3lVcWD_RYd2v{SG6?+M`wMYV!d`Nju+*&BG zn7~~CA@-zJBKn^NQUi{Dd1BR;ZQG2|Q*#+4OBkD$Q*IB6El&>BR5NArCtyL%h<+fu z5S*)Tp0%UzX1PRZ^=yrVa6*dT~b&`7Y^k+`s4O=f5`QXDjz zSh@}+R~1xjk;_YurA1QTnn3+9Ph8QxoeZUv*V-lsRt?v`oK)yQ&PGVZXNkWMG^hgT z8ZC$rWk;HzjbFyBJq7Y7FrGO9z@CN6GQi;16m~PYvuA|}xQRjLADx;K#zY|#47%AT zCDk#!W+_`{%n zueMI*tz#lAAr{R9M7Je*JrK10$Yg{_Yol6#fC0b|BilqsNSwC-d}8EG#KAB=LkIF- z!7vBCO?V;0A;f%BOnsq3>!T;T!j;iq6Lxp)x^)7eE+T=NOfcP%k=2h*YDOZ(5|Pg9 
z_4lbal>>u5tZ`Lq{MRyh^7q2Wd4%-taHH>uJTX}~Mk&|KtZSo}0tSg2MNR+%gL&|TQ82TJG;2{%ClO~$HL#`CLrcjJ}7p6NMouo}n zh6v|F=uMO{7jQ>f$N3-cYWAcUmg+$XYuh$w iKlC=+|EVL7=RHC<@wq&yOiUr~LOHE?D)GdH8~+8?@%X3! diff --git a/x.py b/x.py index 9ee26a2f..9dc6ee51 100644 --- a/x.py +++ b/x.py @@ -360,7 +360,7 @@ def plot_times(mean_times): ), ] ) - .groupby("impl") + .group_by("impl") .agg(pl.col("time").mean()) .sort("time") ) From 7d7811621d473fedab424b6aaa047aa80918a925 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 1 Nov 2023 17:27:20 +0100 Subject: [PATCH 25/27] Add debug symbols to benchmarking code --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index bbed1bef..015d6e66 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,6 @@ members = [ ] resolver = "2" + +[profile.bench] +debug = true From 0df1d2b23143300e1c91102a64f10e3324b785fb Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 1 Nov 2023 17:28:31 +0100 Subject: [PATCH 26/27] Documentation tweaks --- Changes.md | 33 ++------ serde_arrow/Quickstart.md | 6 +- serde_arrow/src/arrow2_impl/api.rs | 15 +++- serde_arrow/src/arrow2_impl/schema.rs | 7 +- serde_arrow/src/arrow_impl/api.rs | 15 +++- serde_arrow/src/arrow_impl/schema.rs | 7 +- serde_arrow/src/internal/error.rs | 6 +- serde_arrow/src/internal/generic.rs | 4 +- serde_arrow/src/internal/schema.rs | 83 ++++++++++--------- serde_arrow/src/internal/tracing/mod.rs | 5 +- serde_arrow/src/lib.rs | 33 ++++---- serde_arrow/src/schema.rs | 25 ++++-- .../src/test_impls/issue_90_type_tracing.rs | 1 + x.py | 19 +++-- 14 files changed, 147 insertions(+), 112 deletions(-) diff --git a/Changes.md b/Changes.md index c369415e..4322aac5 100644 --- a/Changes.md +++ b/Changes.md @@ -5,14 +5,19 @@ Breaking changes: - Make tracing options non-exhaustive -- Remove `try_parse_dates` in favor of `guess_dates` field in `TracingOptions` -- Remove experimental configuration api +- Remove the `try_parse_dates` field in favor of the 
`guess_dates` field in + `TracingOptions` (the setter name is not affected) +- Remove the experimental configuration api -New feature: Improved schema tracing: +Improvements: +- Simpler and streamlined API - Add type based tracing to allow schema tracing without samples + (`SerdeArrowSchema::from_type()`) +- Allow to build schema objects from serializable objects, e.g., + `serde_json::Value` (`SerdeArrowSchema::from_value()`) -Deprecations: +Deprecations (see the documentation of deprecated items for how to migrate): - Rename `serde_arrow::schema::Schema` to `serde_arrow::schema::SerdeArrowSchema` to prevent name clashes with the @@ -24,26 +29,6 @@ Deprecations: - Deprecated single item methods in favor of using the `Items` and `Item` wrappers -Migration guide: - -```rust -// old: -serde_arrow::arrow::serialize_into_arrays(&fields, &items)? -// new: -serde_arrow::to_arrow(&fields, &items)? - -// old -serde_arrow::arrow::deserialize_from_arrays(&fields, &arrays)? -// new -serde::from_arrow(&fields, &arrays)? - -// old -serde_arrow::arrow::serialize_into_fields(&items)? -// new -use serde_arrow::schema::SerdeArrowSchema; -SerdeArrowSchema::from_samples(&items)?.to_arrow_fields()? -``` - ## 0.8.0 Make bytecode based serialization and deserialization the default diff --git a/serde_arrow/Quickstart.md b/serde_arrow/Quickstart.md index 5380373f..dd2e9465 100644 --- a/serde_arrow/Quickstart.md +++ b/serde_arrow/Quickstart.md @@ -125,9 +125,9 @@ Both `arrow` and `arrow2` use the Arrow memory format. 
Thanks to this fact, it is possible to convert arrays between both packages with minimal work using their respective FFI interfaces: -- [arrow2::ffi::export_field_to_c](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_field_to_c.html) -- [arrow2::ffi_export_array_to_ce](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_array_to_c.html) -- [arrow::ffi::ArrowArray::new](https://docs.rs/arrow/latest/arrow/ffi/struct.ArrowArray.html#method.new) +- [`arrow2::ffi::export_field_to_c`](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_field_to_c.html) +- [`arrow2::ffi::export_array_to_c`](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_array_to_c.html) +- [`arrow::ffi::ArrowArray::new`](https://docs.rs/arrow/latest/arrow/ffi/struct.ArrowArray.html#method.new) The arrow2 crate includes [a helper trait][arrow2-arrow2arrow] to perform this conversion when used with the `arrow` feature. diff --git a/serde_arrow/src/arrow2_impl/api.rs b/serde_arrow/src/arrow2_impl/api.rs index 23b48bec..ed5c4a69 100644 --- a/serde_arrow/src/arrow2_impl/api.rs +++ b/serde_arrow/src/arrow2_impl/api.rs @@ -20,6 +20,10 @@ use crate::{ /// Build arrow2 arrays record by record (*requires one of the `arrow2-*` /// features*) /// +/// The given items should be records (e.g., structs). To serialize items +/// encoding single values consider the [`Items`][crate::utils::Items] and +/// [`Item`][crate::utils::Item] wrappers. +/// /// Example: /// /// ```rust @@ -104,9 +108,10 @@ impl Arrow2Builder { /// features*) /// /// `items` should be given in the form a list of records (e.g., a vector of -/// structs). +/// structs). To serialize items encoding single values consider the +/// [`Items`][crate::utils::Items] wrapper. /// -/// To build arrays record by record use [Arrow2Builder]. +/// To build arrays record by record use [`Arrow2Builder`]. 
/// /// ```rust /// # fn main() -> serde_arrow::Result<()> { @@ -153,7 +158,9 @@ where /// Deserialize items from the given arrow2 arrays (*requires* one of the /// `arrow2-*` features) /// -/// The type should be a list of records (e.g., a vector of structs). +/// The type should be a list of records (e.g., a vector of structs). To +/// deserialize items encoding single values consider the +/// [`Items`][crate::utils::Items] wrapper. /// /// ```rust /// # fn main() -> serde_arrow::Result<()> { @@ -370,7 +377,7 @@ where /// # use serde_arrow::_impl::arrow2; /// use arrow2::datatypes::{DataType, Field}; /// use serde_arrow::{Arrow2Builder, utils::{Items, Item}}; -/// +/// /// let fields = vec![Field::new("item", DataType::UInt8, false)]; /// let mut builder = Arrow2Builder::new(&fields)?; /// diff --git a/serde_arrow/src/arrow2_impl/schema.rs b/serde_arrow/src/arrow2_impl/schema.rs index a357cfce..8b53124f 100644 --- a/serde_arrow/src/arrow2_impl/schema.rs +++ b/serde_arrow/src/arrow2_impl/schema.rs @@ -9,7 +9,7 @@ use crate::{ }, }; -/// Support for arrow2 types (requires one of the `arrow2-*` features) +/// Support for arrow2 types (*requires one of the `arrow2-*` features*) impl SerdeArrowSchema { /// Build a new Schema object from fields pub fn from_arrow2_fields(fields: &[Field]) -> Result { @@ -25,13 +25,16 @@ impl SerdeArrowSchema { /// [`to_arrow2_fields`][SerdeArrowSchema::to_arrow2_fields] instead: /// /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { /// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; /// # #[derive(serde::Deserialize)] /// # struct Item { a: u32 } /// # let schema = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap(); /// # let fields = - /// schema.to_arrow2_fields().unwrap() + /// schema.to_arrow2_fields()? /// # ; + /// # Ok(()) + /// # } /// ``` #[deprecated = "The method `get_arrow2_fields` is deprecated. 
Use `to_arrow2_fields` instead"] pub fn get_arrow2_fields(&self) -> Result> { diff --git a/serde_arrow/src/arrow_impl/api.rs b/serde_arrow/src/arrow_impl/api.rs index 2f49a064..d9f2036b 100644 --- a/serde_arrow/src/arrow_impl/api.rs +++ b/serde_arrow/src/arrow_impl/api.rs @@ -20,6 +20,10 @@ use crate::{ /// Build arrow arrays record by record (*requires one of the `arrow-*` /// features*) /// +/// The given items should be records (e.g., structs). To serialize items +/// encoding single values consider the [`Items`][crate::utils::Items] and +/// [`Item`][crate::utils::Item] wrappers. +/// /// Example: /// /// ```rust @@ -104,7 +108,10 @@ impl ArrowBuilder { /// features*)) /// /// `items` should be given in the form a list of records (e.g., a vector of -/// structs). +/// structs). To serialize items encoding single values consider the +/// [`Items`][crate::utils::Items] wrapper. +/// +/// To build arrays record by record use [`ArrowBuilder`]. /// /// Example: /// @@ -148,7 +155,9 @@ pub fn to_arrow(fields: &[Field], items: &T) -> Result serde_arrow::Result<()> { @@ -367,7 +376,7 @@ where /// # use serde_arrow::_impl::arrow; /// use arrow::datatypes::{DataType, Field}; /// use serde_arrow::{ArrowBuilder, utils::{Items, Item}}; -/// +/// /// let fields = vec![Field::new("item", DataType::UInt8, false)]; /// let mut builder = ArrowBuilder::new(&fields)?; /// diff --git a/serde_arrow/src/arrow_impl/schema.rs b/serde_arrow/src/arrow_impl/schema.rs index ff12e8d4..63fd5612 100644 --- a/serde_arrow/src/arrow_impl/schema.rs +++ b/serde_arrow/src/arrow_impl/schema.rs @@ -10,7 +10,7 @@ use crate::{ }, }; -/// Support for arrow types (requires one of the `arrow-*` features) +/// Support for arrow types (*requires one of the `arrow-*` features*) impl SerdeArrowSchema { /// Build a new Schema object from fields pub fn from_arrow_fields(fields: &[Field]) -> Result { @@ -26,13 +26,16 @@ impl SerdeArrowSchema { /// [`to_arrow_fields`][SerdeArrowSchema::to_arrow_fields] instead: 
/// /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { /// # use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; /// # #[derive(serde::Deserialize)] /// # struct Item { a: u32 } /// # let schema = SerdeArrowSchema::from_type::(TracingOptions::default()).unwrap(); /// # let fields = - /// schema.to_arrow_fields().unwrap() + /// schema.to_arrow_fields()? /// # ; + /// # Ok(()) + /// # } /// ``` #[deprecated = "The method `get_arrow_fields` is deprecated. Use `to_arrow_fields` instead"] pub fn get_arrow_fields(&self) -> Result> { diff --git a/serde_arrow/src/internal/error.rs b/serde_arrow/src/internal/error.rs index 957aa602..1c0081de 100644 --- a/serde_arrow/src/internal/error.rs +++ b/serde_arrow/src/internal/error.rs @@ -9,10 +9,10 @@ pub type Result = std::result::Result; /// At the moment only a generic string error is supported, but it is planned to /// offer concrete types to match against. /// -/// The error carries a backtrace if `RUST_BACKTRACE=1`, see [std::backtrace] +/// The error carries a backtrace if `RUST_BACKTRACE=1`, see [`std::backtrace`] /// for details. This backtrace is included when printing the error. If the -/// error is caused by another error, that error can be retrieved with the -/// [source][std::error::Error::source] function. +/// error is caused by another error, that error can be retrieved with +/// [`source()`][std::error::Error::source]. /// #[non_exhaustive] pub enum Error { diff --git a/serde_arrow/src/internal/generic.rs b/serde_arrow/src/internal/generic.rs index c90e643b..e2372457 100644 --- a/serde_arrow/src/internal/generic.rs +++ b/serde_arrow/src/internal/generic.rs @@ -71,7 +71,7 @@ where /// A wrapper around a sequence of items /// /// When serialized or deserialized, it behaves as if each item was wrapped in a -/// struct with a single attribute `item`. +/// struct with a single attribute `"item"`. 
/// /// ```rust /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { @@ -98,7 +98,7 @@ pub struct Items( /// A wrapper around a single item /// /// When serialized or deserialized, it behaves as if the Item was wrapped in a -/// struct with a single attribute `item`. +/// struct with a single attribute `"item"`. /// /// ```rust /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index 3733da34..2370bf0f 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -55,15 +55,16 @@ impl SerdeArrowSchema { Self::default() } - /// Build the schema from an object that implements serialize (e.g., `serde_json::Value`) + /// Build the schema from an object that implements serialize (e.g., + /// `serde_json::Value`) /// /// ```rust /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { /// use serde_arrow::schema::SerdeArrowSchema; /// /// let schema = serde_json::json!([ - /// {"name":"foo","data_type":"U8"}, - /// {"name":"bar","data_type":"Utf8"}, + /// {"name": "foo", "data_type": "U8"}, + /// {"name": "bar", "data_type": "Utf8"}, /// ]); /// /// let schema = SerdeArrowSchema::from_value(&schema)?; @@ -94,27 +95,29 @@ impl SerdeArrowSchema { /// /// - `"name"` (**required**): the name of the field /// - `"data_type"` (**required**): the data type of the field as a string - /// - `"nullable"` (**optional**): if `true`, the field can contain null values - /// - `"strategy"` (**optional**): if given a string describing the strategy to - /// use (e.g., "NaiveStrAsDate64"). - /// - `"children"` (**optional**): a list of child fields, the semantics depend - /// on the data type + /// - `"nullable"` (**optional**): if `true`, the field can contain null + /// values + /// - `"strategy"` (**optional**): if given a string describing the strategy + /// to use (e.g., "NaiveStrAsDate64"). 
+ /// - `"children"` (**optional**): a list of child fields, the semantics + /// depend on the data type /// - /// The following data types can be given + /// The following data types are supported: /// /// - booleans: `"Bool"` /// - signed integers: `"I8"`, `"I16"`, `"I32"`, `"I64"` /// - unsigned integers: `"U8"`, `"U16"`, `"U32"`, `"U64"` /// - floats: `"F16"`, `"F32"`, `"F64"` /// - strings: `"Utf8"`, `"LargeUtf8"` - /// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field - /// named `"element"` that describes the element types + /// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single + /// field named `"element"` that describes the element types /// - structs: `"Struct"`. `"children"` must contain the child fields /// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and /// `"value"` that encode the key and value types /// - unions: `"Union"`. `"children"` must contain the different variants /// - dictionaries: `"Dictionary"`. `"children"` must contain two different - /// fields, named `"key"` of integer type and named `"value"` of string type + /// fields, named `"key"` of integer type and named `"value"` of string + /// type /// pub fn from_value(value: &T) -> Result { // simple version of serde-transcode @@ -126,6 +129,17 @@ impl SerdeArrowSchema { /// Determine the schema from the given record type /// + /// This approach requires the type `T` to implement + /// [`Deserialize`][serde::Deserialize]. As only type information is used, + /// it is not possible to detect data dependent properties. E.g., it is not + /// possible to auto detect date time strings. + /// + /// Note, the type `T` must encode a single "row" in the resulting data + /// frame. When encoding single arrays, use the [`Item`][crate::utils::Item] + /// wrapper instead of [`Items`][crate::utils::Items]. + /// + /// See [`TracingOptions`] for customization options. 
+ /// ```rust /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { /// # use serde_arrow::_impl::arrow; @@ -150,17 +164,6 @@ impl SerdeArrowSchema { /// # } /// ``` /// - /// This approach requires the type to implement - /// [`Deserialize`][serde::Deserialize]. As only type information is used, - /// it is not possible to detect data dependent properties. E.g., it is not - /// possible to auto detect date time strings. - /// - /// Note, the type must encode a single "row" in the resulting data frame. - /// When encoding single arrays, use the [Item][crate::utils::Item] wrapper - /// instead of [Items][crate::utils::Items]. - /// - /// See [TracingOptions] for customization options. - /// pub fn from_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_type::()?; @@ -169,6 +172,21 @@ impl SerdeArrowSchema { /// Determine the schema from the given samples /// + /// + /// This approach requires the type `T` to implement + /// [`Serialize`][serde::Serialize] and the samples to include all relevant + /// values. It uses only the information encoded in the samples to generate + /// the schema. Therefore, the following requirements must be met: + /// + /// - at least one `Some` value for `Option<..>` fields + /// - all variants of enum fields + /// - at least one element for sequence fields (e.g., `Vec<..>`) + /// - at least one example for map types (e.g., `HashMap<.., ..>`). All + /// possible keys must be given, if [`options.map_as_struct == + /// true`][TracingOptions::map_as_struct] + /// + /// See [`TracingOptions`] for customization options. + /// ```rust /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { /// # use serde_arrow::_impl::arrow; @@ -207,20 +225,6 @@ impl SerdeArrowSchema { /// # } /// ``` /// - /// This approach requires the type to implement - /// [`Serialize`][serde::Serialize] and the samples to include all relevant - /// values. 
It uses only the information encoded in the samples to generate - /// the schema. Therefore, the following requirements must be met: - /// - /// - at least one `Some` value for `Option` fields - /// - all variants of enum fields - /// - at least one element of sequence fields (e.g., `Vec`) - /// - at least one example of map types (with all possible keys , if - /// [`options.map_as_struct == true`][TracingOptions::map_as_struct]) - /// (e.g., `HashMap`) - /// - /// See [TracingOptions] for customization options. - /// pub fn from_samples(samples: &T, options: TracingOptions) -> Result { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_samples(samples)?; @@ -244,10 +248,15 @@ pub enum Strategy { /// Serialize Rust strings containing UTC datetimes with timezone as Arrows /// Date64 /// + /// This strategy makes sense for chrono's `DateTime` types without + /// additional configuration, as they are serialized as strings. UtcStrAsDate64, /// Serialize Rust strings containing datetimes without timezone as Arrow /// Date64 /// + /// This strategy makes sense for chrono's `NaiveDateTime` types without + /// additional configuration, as they are serialized as strings. + /// NaiveStrAsDate64, /// Serialize Rust tuples as Arrow structs with numeric field names starting /// at `"0"` diff --git a/serde_arrow/src/internal/tracing/mod.rs b/serde_arrow/src/internal/tracing/mod.rs index 26f3d048..aa3df6a8 100644 --- a/serde_arrow/src/internal/tracing/mod.rs +++ b/serde_arrow/src/internal/tracing/mod.rs @@ -41,8 +41,11 @@ pub struct TracingOptions { /// If `true` serialize strings dictionary encoded. The default is `false`. /// - /// If `true`, strings are traced as `Dictionary(UInt64, LargeUtf8)`. If + /// If `true`, strings are traced as `Dictionary(UInt32, LargeUtf8)`. If /// `false`, strings are traced as `LargeUtf8`. + /// + /// Note: the 32 bit offsets are chosen, as they are supported by the + /// default polars package. 
pub string_dictionary_encoding: bool, /// If `true`, coerce different numeric types. diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 195f276c..d6a89bc0 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -20,8 +20,8 @@ //! schema should contain a `Date64`. `serde_arrow` supports to derive the //! schema from the data itself via schema tracing, but does not require it. It //! is always possible to specify the schema manually. See the [`schema` -//! module][schema] and [SerdeArrowSchema][schema::SerdeArrowSchema] -//! for further details. +//! module][schema] and [`SerdeArrowSchema`][schema::SerdeArrowSchema] for +//! further details. //! //! ## Overview //! @@ -106,20 +106,18 @@ //! //! Available features: //! -//! | Feature | Arrow Version | -//! |---------------|---------------| -//! | `arrow-46` | `arrow=46` | -//! | `arrow-45` | `arrow=45` | -//! | `arrow-44` | `arrow=44` | -//! | `arrow-43` | `arrow=43` | -//! | `arrow-42` | `arrow=42` | -//! | `arrow-41` | `arrow=41` | -//! | `arrow-40` | `arrow=40` | -//! | `arrow-39` | `arrow=39` | -//! | `arrow-38` | `arrow=38` | -//! | `arrow-37` | `arrow=37` | -//! | `arrow2-0-17` | `arrow2=0.17` | -//! | `arrow2-0-16` | `arrow2=0.16` | +//! | Arrow Feature | Arrow Version | | Arrow2 Feature | Arrow2 Version | +//! |---------------|---------------|---|----------------|----------------| +//! | `arrow-46` | `arrow=46` | | `arrow2-0-17` | `arrow2=0.17` | +//! | `arrow-45` | `arrow=45` | | `arrow2-0-16` | `arrow2=0.16` | +//! | `arrow-44` | `arrow=44` | | | | +//! | `arrow-43` | `arrow=43` | | | | +//! | `arrow-42` | `arrow=42` | | | | +//! | `arrow-41` | `arrow=41` | | | | +//! | `arrow-40` | `arrow=40` | | | | +//! | `arrow-39` | `arrow=39` | | | | +//! | `arrow-38` | `arrow=38` | | | | +//! | `arrow-37` | `arrow=37` | | | | //! mod internal; @@ -135,6 +133,7 @@ pub mod _impl { macro_rules! 
build_arrow2_crate { ($arrow2:ident) => { /// Re-export the used arrow2 crate + #[doc(hidden)] pub use $arrow2 as arrow2; }; } @@ -147,6 +146,7 @@ pub mod _impl { ($arrow_array:ident, $arrow_buffer:ident, $arrow_data:ident, $arrow_schema:ident) => { /// A "fake" arrow crate re-exporting the relevant definitions of the /// used arrow-* subcrates + #[doc(hidden)] pub mod arrow { /// The raw arrow packages pub mod _raw { @@ -210,6 +210,7 @@ pub mod _impl { } // Reexport for tests + #[doc(hidden)] pub use crate::internal::error::PanicOnError; } diff --git a/serde_arrow/src/schema.rs b/serde_arrow/src/schema.rs index 3d324feb..1aaaadd9 100644 --- a/serde_arrow/src/schema.rs +++ b/serde_arrow/src/schema.rs @@ -2,15 +2,26 @@ //! //! To convert between Rust objects and Arrow types, `serde_arrows` requires //! schema information as a list of Arrow fields with additional meta data. See -//! [SerdeArrowSchema] for details how to specify the schema. +//! [`SerdeArrowSchema`] for details on how to specify the schema. //! -//! The default mapping of Rust types to Arrow types is as follows: +//! The default mapping of Rust types to [Arrow types][arrow-types] is as follows: //! -//! - Strings: `LargeUtf8`, i.e., i64 offsets -//! - Lists: `LargeList`, i.e., i64 offsets -//! - Strings with dictionary encoding: `UInt32` keys and `LargeUtf8` values +//! [arrow-types]: https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html //! -//! All customization of the types happens by including a suitable [Strategy] in +//! - `()`: `Null` +//! - `bool`: `Boolean` +//! - `u8`, .., `u64`, `i8`, .., `i64`: `UInt8`, .., `UInt64`, `Int8`, .., +//! `Int64` +//! - Floats: floats are directly mapped (`f32` -> `Float32`) +//! - Strings: `LargeUtf8` with i64 offsets +//! - Sequences: `LargeList` with i64 offsets +//! - Structs / Map / Tuples: `Struct` type +//! - Enums: dense Unions. Each variant is mapped to a separate field. Its type +//! 
depends on the union type: Field-less variants are mapped to `NULL`. New +//! type variants are mapped according to their inner type. Other variant +//! types are mapped to struct types. +//! +//! All customization of the types happens by including a suitable [`Strategy`] in //! the metadata of the fields. For example, to let `serde_arrow` handle date //! time objects that are serialized to strings (chrono's default), use //! @@ -31,6 +42,6 @@ pub use crate::internal::{ tracing::TracingOptions, }; -/// Type alias for SerdeArrowSchema for backwards compatibility +/// Renamed to [`SerdeArrowSchema`] #[deprecated = "serde_arrow::schema::Schema is deprecated. Use serde_arrow::schema::SerdeArrowSchema instead"] pub type Schema = SerdeArrowSchema; diff --git a/serde_arrow/src/test_impls/issue_90_type_tracing.rs b/serde_arrow/src/test_impls/issue_90_type_tracing.rs index 108cf637..50b52fbd 100644 --- a/serde_arrow/src/test_impls/issue_90_type_tracing.rs +++ b/serde_arrow/src/test_impls/issue_90_type_tracing.rs @@ -311,6 +311,7 @@ mod mixed_tracing_unions { #[test] fn unsupported_recursive_types() { + #[allow(unused)] #[derive(Deserialize)] struct Tree { left: Option>, diff --git a/x.py b/x.py index 9dc6ee51..510726ef 100644 --- a/x.py +++ b/x.py @@ -149,17 +149,20 @@ def fmt(): @cmd(help="Run the linting") -def lint(): +@arg("--fast", action="store_true") +def lint(fast=False): check_cargo_toml() - cargo("check") - for arrow2_feature in (*all_arrow2_features, *all_arrow_features): - cargo( - "check", - "--features", - arrow2_feature, - ) + cargo("check", "--features", default_features) cargo("clippy", "--features", default_features) + if not fast: + for arrow2_feature in (*all_arrow2_features, *all_arrow_features): + cargo( + "check", + "--features", + arrow2_feature, + ) + @cmd(help="Run the example") def example(): From d93c722352bfe92b9a5cbfb0ea31836767fb1311 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 1 Nov 2023 17:51:54 +0100 Subject: [PATCH 27/27] 
Add support for arrow=47, arrow=48 --- .github/workflows/release.yml | 12 +- .github/workflows/test.yml | 12 +- Cargo.lock | 246 ++++++++++++++++++++++------ Changes.md | 1 + serde_arrow/Cargo.toml | 20 ++- serde_arrow/benches/groups/impls.rs | 4 +- serde_arrow/build.rs | 4 + serde_arrow/src/lib.rs | 2 + x.py | 2 + 9 files changed, 242 insertions(+), 61 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae0abdb0..9cad8cbc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -80,13 +80,21 @@ "name": "Check arrow-46", "run": "cargo check --features arrow-46" }, + { + "name": "Check arrow-47", + "run": "cargo check --features arrow-47" + }, + { + "name": "Check arrow-48", + "run": "cargo check --features arrow-48" + }, { "name": "Build", - "run": "cargo build --features arrow2-0-17,arrow-46" + "run": "cargo build --features arrow2-0-17,arrow-48" }, { "name": "Test", - "run": "cargo test --features arrow2-0-17,arrow-46" + "run": "cargo test --features arrow2-0-17,arrow-48" }, { "name": "Publish to crates.io", diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a8f783c..963ae2e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -85,13 +85,21 @@ "name": "Check arrow-46", "run": "cargo check --features arrow-46" }, + { + "name": "Check arrow-47", + "run": "cargo check --features arrow-47" + }, + { + "name": "Check arrow-48", + "run": "cargo check --features arrow-48" + }, { "name": "Build", - "run": "cargo build --features arrow2-0-17,arrow-46" + "run": "cargo build --features arrow2-0-17,arrow-48" }, { "name": "Test", - "run": "cargo test --features arrow2-0-17,arrow-46" + "run": "cargo test --features arrow2-0-17,arrow-48" } ] } diff --git a/Cargo.lock b/Cargo.lock index 3aa94fbb..20633db2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,6 +15,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -203,6 +209,38 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-array" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" +dependencies = [ + "ahash", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "chrono", + "half 2.2.1", + "hashbrown 0.14.0", + "num", +] + +[[package]] +name = "arrow-array" +version = "48.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55705ada5cdde4cb0f202ffa6aa756637e33fea30e13d8d0d0fd6a24ffcee1e3" +dependencies = [ + "ahash", + "arrow-buffer 48.0.0", + "arrow-data 48.0.0", + "arrow-schema 48.0.0", + "chrono", + "half 2.2.1", + "hashbrown 0.14.0", + "num", +] + [[package]] name = "arrow-buffer" version = "37.0.0" @@ -304,16 +342,38 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda119225204141138cb0541c692fbfef0e875ba01bfdeaed09e9d354f9d6195" +dependencies = [ + "bytes", + "half 2.2.1", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "48.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a722f90a09b94f295ab7102542e97199d3500128843446ef63e410ad546c5333" +dependencies = [ + "bytes", + "half 2.2.1", + "num", +] + [[package]] name = "arrow-cast" -version = "46.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e8b9990733a9b635f656efda3c9b8308c7a19695c9ec2c7046dd154f9b144b" +checksum = "af01fc1a06f6f2baf31a04776156d47f9f31ca5939fe6d00cd7a059f95a46ff1" dependencies = [ - "arrow-array 46.0.0", - "arrow-buffer 46.0.0", - "arrow-data 46.0.0", - 
"arrow-schema 46.0.0", + "arrow-array 48.0.0", + "arrow-buffer 48.0.0", + "arrow-data 48.0.0", + "arrow-schema 48.0.0", "arrow-select", "chrono", "half 2.2.1", @@ -441,6 +501,30 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-data" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" +dependencies = [ + "arrow-buffer 47.0.0", + "arrow-schema 47.0.0", + "half 2.2.1", + "num", +] + +[[package]] +name = "arrow-data" +version = "48.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a547195e607e625e7fafa1a7269b8df1a4a612c919efd9b26bd86e74538f3a" +dependencies = [ + "arrow-buffer 48.0.0", + "arrow-schema 48.0.0", + "half 2.2.1", + "num", +] + [[package]] name = "arrow-format" version = "0.8.1" @@ -453,15 +537,15 @@ dependencies = [ [[package]] name = "arrow-json" -version = "46.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d1b91a63c356d14eedc778b76d66a88f35ac8498426bb0799a769a49a74a8b4" +checksum = "7ac346bc84846ab425ab3c8c7b6721db90643bc218939677ed7e071ccbfb919d" dependencies = [ - "arrow-array 46.0.0", - "arrow-buffer 46.0.0", + "arrow-array 48.0.0", + "arrow-buffer 48.0.0", "arrow-cast", - "arrow-data 46.0.0", - "arrow-schema 46.0.0", + "arrow-data 48.0.0", + "arrow-schema 48.0.0", "chrono", "half 2.2.1", "indexmap 2.0.0", @@ -531,16 +615,29 @@ version = "46.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b104f5daa730f00fde22adc03a12aa5a2ae9ccbbf99cbd53d284119ddc90e03d" +[[package]] +name = "arrow-schema" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" + +[[package]] +name = "arrow-schema" +version = "48.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9d7a8c3f97f5ef6abd862155a6f39aaba36b029322462d72bbcfa69782a50614" + [[package]] name = "arrow-select" -version = "46.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b3ca55356d1eae07cf48808d8c462cea674393ae6ad1e0b120f40b422eb2b4" +checksum = "f868f4a5001429e20f7c1994b5cd1aa68b82e3db8cf96c559cdb56dc8be21410" dependencies = [ - "arrow-array 46.0.0", - "arrow-buffer 46.0.0", - "arrow-data 46.0.0", - "arrow-schema 46.0.0", + "ahash", + "arrow-array 48.0.0", + "arrow-buffer 48.0.0", + "arrow-data 48.0.0", + "arrow-schema 48.0.0", "num", ] @@ -684,18 +781,17 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ + "android-tzdata", "iana-time-zone", "js-sys", - "num-integer", "num-traits", "serde", - "time", "wasm-bindgen", - "winapi", + "windows-targets 0.48.5", ] [[package]] @@ -921,9 +1017,9 @@ checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30" [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "equivalent" @@ -976,7 +1072,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -1551,6 +1647,8 @@ dependencies = [ "arrow-array 44.0.0", "arrow-array 45.0.0", "arrow-array 46.0.0", + "arrow-array 47.0.0", + "arrow-array 48.0.0", "arrow-buffer 37.0.0", "arrow-buffer 38.0.0", "arrow-buffer 39.0.0", @@ -1561,6 +1659,8 @@ dependencies = [ 
"arrow-buffer 44.0.0", "arrow-buffer 45.0.0", "arrow-buffer 46.0.0", + "arrow-buffer 47.0.0", + "arrow-buffer 48.0.0", "arrow-data 37.0.0", "arrow-data 38.0.0", "arrow-data 39.0.0", @@ -1571,6 +1671,8 @@ dependencies = [ "arrow-data 44.0.0", "arrow-data 45.0.0", "arrow-data 46.0.0", + "arrow-data 47.0.0", + "arrow-data 48.0.0", "arrow-json", "arrow-schema 37.0.0", "arrow-schema 38.0.0", @@ -1582,6 +1684,8 @@ dependencies = [ "arrow-schema 44.0.0", "arrow-schema 45.0.0", "arrow-schema 46.0.0", + "arrow-schema 47.0.0", + "arrow-schema 48.0.0", "arrow2 0.16.0", "arrow2 0.17.0", "arrow2_convert", @@ -1677,17 +1781,6 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -1741,12 +1834,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1854,7 +1941,7 @@ version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdacb41e6a96a052c6cb63a144f24900236121c6f63f4f8219fef5977ecb0c25" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", ] [[package]] @@ -1863,13 +1950,28 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - 
"windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1878,38 +1980,80 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_i686_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/Changes.md b/Changes.md index 4322aac5..ee076a02 100644 --- a/Changes.md +++ b/Changes.md @@ -16,6 +16,7 @@ Improvements: (`SerdeArrowSchema::form_type()`) - Allow to build schema objects from serializable objects, e.g., `serde_json::Value` 
(`SerdeArrow::from_value()`) +- Add support for `arrow=47` and `arrow=48` Deprecations (see the documentation of deprecated items for how to migratie): diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index 1045f80e..c291b6fb 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -13,15 +13,17 @@ bench = false [[bench]] name = "arrow2" -required-features = ["arrow2-0-17", "arrow-46"] +required-features = ["arrow2-0-17", "arrow-48"] harness = false [package.metadata.docs.rs] -features = ["arrow2-0-17", "arrow-46"] +features = ["arrow2-0-17", "arrow-48"] [features] default = [] +arrow-48 = ["dep:arrow-array-48", "dep:arrow-schema-48", "dep:arrow-data-48", "dep:arrow-buffer-48"] +arrow-47 = ["dep:arrow-array-47", "dep:arrow-schema-47", "dep:arrow-data-47", "dep:arrow-buffer-47"] arrow-46 = ["dep:arrow-array-46", "dep:arrow-schema-46", "dep:arrow-data-46", "dep:arrow-buffer-46"] arrow-45 = ["dep:arrow-array-45", "dep:arrow-schema-45", "dep:arrow-data-45", "dep:arrow-buffer-45"] arrow-44 = ["dep:arrow-array-44", "dep:arrow-schema-44", "dep:arrow-data-44", "dep:arrow-buffer-44"] @@ -40,6 +42,16 @@ chrono = "0.4" half = { version = "2", features = ["bytemuck"] } serde = { version = "1.0", features = ["derive"] } +arrow-array-48 = { package = "arrow-array", version = "48", optional = true } +arrow-buffer-48 = { package = "arrow-buffer", version = "48", optional = true } +arrow-data-48 = { package = "arrow-data", version="48", optional = true } +arrow-schema-48 = { package = "arrow-schema", version = "48", optional = true } + +arrow-array-47 = { package = "arrow-array", version = "47", optional = true } +arrow-buffer-47 = { package = "arrow-buffer", version = "47", optional = true } +arrow-data-47 = { package = "arrow-data", version="47", optional = true } +arrow-schema-47 = { package = "arrow-schema", version = "47", optional = true } + arrow-array-46 = { package = "arrow-array", version = "46", optional = true } arrow-buffer-46 = { package = 
"arrow-buffer", version = "46", optional = true } arrow-data-46 = { package = "arrow-data", version="46", optional = true } @@ -90,8 +102,8 @@ arrow-buffer-37 = { package = "arrow-buffer", version = "37", optional = true } arrow-data-37 = { package = "arrow-data", version="37", optional = true } arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true } -arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true } arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true } +arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true } [dev-dependencies] anyhow = "1" @@ -101,6 +113,6 @@ serde_json = "1" rand = "0.8" # for benchmarks -arrow-json-46 = { package = "arrow-json", version = "46" } +arrow-json-48 = { package = "arrow-json", version = "48" } criterion = "0.4" arrow2_convert = "0.5.0" diff --git a/serde_arrow/benches/groups/impls.rs b/serde_arrow/benches/groups/impls.rs index b75aa5da..8ed7d37c 100644 --- a/serde_arrow/benches/groups/impls.rs +++ b/serde_arrow/benches/groups/impls.rs @@ -94,8 +94,8 @@ pub mod arrow { use std::sync::Arc; - use arrow_json_46::ReaderBuilder; - use arrow_schema_46::Schema; + use arrow_json_48::ReaderBuilder; + use arrow_schema_48::Schema; use serde::Serialize; diff --git a/serde_arrow/build.rs b/serde_arrow/build.rs index 25bd2b36..463dcd8b 100644 --- a/serde_arrow/build.rs +++ b/serde_arrow/build.rs @@ -14,6 +14,10 @@ fn main() { } let max_arrow_version: Option = [ + #[cfg(feature = "arrow-48")] + 48, + #[cfg(feature = "arrow-47")] + 47, #[cfg(feature = "arrow-46")] 46, #[cfg(feature = "arrow-45")] diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index d6a89bc0..2786ace3 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -182,6 +182,8 @@ pub mod _impl { }; } + #[cfg(has_arrow_48)] build_arrow_crate!(arrow_array_48, arrow_buffer_48, arrow_data_48, arrow_schema_48); + #[cfg(has_arrow_47)] build_arrow_crate!(arrow_array_47, arrow_buffer_47, 
arrow_data_47, arrow_schema_47); #[cfg(has_arrow_46)] build_arrow_crate!(arrow_array_46, arrow_buffer_46, arrow_data_46, arrow_schema_46); #[cfg(has_arrow_45)] build_arrow_crate!(arrow_array_45, arrow_buffer_45, arrow_data_45, arrow_schema_45); #[cfg(has_arrow_44)] build_arrow_crate!(arrow_array_44, arrow_buffer_44, arrow_data_44, arrow_schema_44); diff --git a/x.py b/x.py index 510726ef..85fd56c1 100644 --- a/x.py +++ b/x.py @@ -15,6 +15,8 @@ "arrow-44", "arrow-45", "arrow-46", + "arrow-47", + "arrow-48", ] all_arrow2_features = ["arrow2-0-16", "arrow2-0-17"] default_features = f"{all_arrow2_features[-1]},{all_arrow_features[-1]}"