From 7c91f3f21d1948818437bc2439ee074d97f5553b Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 31 Oct 2023 15:06:24 +0100 Subject: [PATCH] Rework schema docs --- serde_arrow/Quickstart.md | 23 +-- serde_arrow/src/internal/schema.rs | 216 ++++++++++++++++++++++------- serde_arrow/src/lib.rs | 3 +- serde_arrow/src/schema.rs | 22 ++- 4 files changed, 175 insertions(+), 89 deletions(-) diff --git a/serde_arrow/Quickstart.md b/serde_arrow/Quickstart.md index 39fc5ad9..5380373f 100644 --- a/serde_arrow/Quickstart.md +++ b/serde_arrow/Quickstart.md @@ -5,8 +5,7 @@ 1. [Working with date time objects](#working-with-date-time-objects) 2. [Dictionary encoding for strings](#dictionary-encoding-for-strings) 3. [Working with enums](#working-with-enums) -4. [Specifying the schema in JSON](#specifying-the-schema-in-json) -5. [Convert from arrow2 to arrow arrays](#convert-from-arrow2-to-arrow-arrays) +4. [Convert from arrow2 to arrow arrays](#convert-from-arrow2-to-arrow-arrays) ## Working with date time objects @@ -120,26 +119,6 @@ will be mapped to the following arrow union: - `type = 1`: `Struct { 0: u32, 1: u32 }` - `type = 2`: `Struct { a: f32, b: f32 }` -## Specifying the schema in JSON - -TODO: cross-reference - -```rust -let schema_json = r#" - [ - { - "name": "date", - "data_type": "Date64", - "strategy": "NaiveStrAsDate64" - }, - {"name":"foo","data_type":"U8"}, - {"name":"bar","data_type":"Utf8"} - ] -"#; - -let schema: Schema = serde_json::from_str(&schema_json).unwrap(); -``` - ## Convert from arrow2 to arrow arrays Both `arrow` and `arrow2` use the Arrow memory format. 
Thanks to this fact, it diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index 9a79e952..3733da34 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -18,55 +18,14 @@ pub const STRATEGY_KEY: &str = "SERDE_ARROW:strategy"; /// A collection of fields as understood by `serde_arrow` /// -/// `SerdeArrowSchema` is designed to be easily serialized and deserialized +/// There are three main ways to specify the schema: /// -/// ```rust -/// # use serde_arrow::schema::SerdeArrowSchema; -/// let schema_json = r#" -/// [ -/// { -/// "name": "date", -/// "data_type": "Date64", -/// "strategy": "NaiveStrAsDate64" -/// }, -/// {"name":"foo","data_type":"U8"}, -/// {"name":"bar","data_type":"Utf8"} -/// ] -/// "#; -/// -/// let schema: SerdeArrowSchema = serde_json::from_str(&schema_json).unwrap(); -/// ``` -/// -/// The schema can be given in two ways: -/// -/// - an array of fields -/// - or an object with a `"fields"` key that contains an array of fields -/// -/// Each field is an object with the following keys: -/// -/// - `"name"` (**required**): the name of the field -/// - `"data_type"` (**required**): the data type of the field as a string -/// - `"nullable"` (**optional**): if `true`, the field can contain null values -/// - `"strategy"` (**optional**): if given a string describing the strategy to -/// use (e.g., "NaiveStrAsDate64"). -/// - `"children"` (**optional**): a list of child fields, the semantics depend -/// on the data type -/// -/// The following data types can be given -/// -/// - booleans: `"Bool"` -/// - signed integers: `"I8"`, `"I16"`, `"I32"`, `"I64"` -/// - unsigned integers: `"U8"`, `"U16"`, `"U32"`, `"U64"` -/// - floats: `"F16"`, `"F32"`, `"F64"` -/// - strings: `"Utf8"`, `"LargeUtf8"` -/// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field -/// named `"element"` that describes the element types -/// - structs: `"Struct"`. 
`"children"` must contain the child fields -/// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and -/// `"value"` that encode the key and value types -/// - unions: `"Union"`. `"children"` must contain the different variants -/// - dictionaries: `"Dictionary"`. `"children"` must contain two different -/// fields, named `"key"` of integer type and named `"value"` of string type +/// 1. [`SerdeArrowSchema::from_value`]: specify the schema manually, e.g., as a +/// JSON value +/// 2. [`SerdeArrowSchema::from_type`]: determine the schema from the record +/// type +/// 3. [`SerdeArrowSchema::from_samples`]: determine the schema from samples of +/// the data /// #[derive(Default, Debug, PartialEq, Clone, Serialize, Deserialize)] #[serde(from = "SchemaSerializationOptions")] @@ -96,7 +55,112 @@ impl SerdeArrowSchema { Self::default() } - /// Determine the schema from the given type + /// Build the schema from an object that implements [`Serialize`][serde::Serialize] (e.g., `serde_json::Value`) + /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// use serde_arrow::schema::SerdeArrowSchema; + /// + /// let schema = serde_json::json!([ + /// {"name":"foo","data_type":"U8"}, + /// {"name":"bar","data_type":"Utf8"}, + /// ]); + /// + /// let schema = SerdeArrowSchema::from_value(&schema)?; + /// # Ok(()) + /// # } + /// ``` + /// + /// `SerdeArrowSchema` can also be directly serialized and deserialized. 
+ /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # let json_schema_str = "[]"; + /// # + /// use serde_arrow::schema::SerdeArrowSchema; + /// + /// let schema: SerdeArrowSchema = serde_json::from_str(json_schema_str)?; + /// serde_json::to_string(&schema)?; + /// # Ok(()) + /// # } + /// ``` + /// + /// The schema can be given in two ways: + /// + /// - an array of fields + /// - or an object with a `"fields"` key that contains an array of fields + /// + /// Each field is an object with the following keys: + /// + /// - `"name"` (**required**): the name of the field + /// - `"data_type"` (**required**): the data type of the field as a string + /// - `"nullable"` (**optional**): if `true`, the field can contain null values + /// - `"strategy"` (**optional**): if given a string describing the strategy to + /// use (e.g., "NaiveStrAsDate64"). + /// - `"children"` (**optional**): a list of child fields, the semantics depend + /// on the data type + /// + /// The following data types can be given + /// + /// - booleans: `"Bool"` + /// - signed integers: `"I8"`, `"I16"`, `"I32"`, `"I64"` + /// - unsigned integers: `"U8"`, `"U16"`, `"U32"`, `"U64"` + /// - floats: `"F16"`, `"F32"`, `"F64"` + /// - strings: `"Utf8"`, `"LargeUtf8"` + /// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field + /// named `"element"` that describes the element types + /// - structs: `"Struct"`. `"children"` must contain the child fields + /// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and + /// `"value"` that encode the key and value types + /// - unions: `"Union"`. `"children"` must contain the different variants + /// - dictionaries: `"Dictionary"`. 
`"children"` must contain two different + /// fields, named `"key"` of integer type and named `"value"` of string type + /// + pub fn from_value<T: Serialize + ?Sized>(value: &T) -> Result<Self> { + // simple version of serde-transcode + let mut events = Vec::<Event>::new(); + crate::internal::sink::serialize_into_sink(&mut events, value)?; + let this: Self = crate::internal::source::deserialize_from_source(&events)?; + Ok(this) + } + + /// Determine the schema from the given record type + /// + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # use serde_arrow::_impl::arrow; + /// use arrow::datatypes::DataType; + /// use serde::Deserialize; + /// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// + /// ##[derive(Deserialize)] + /// struct Record { + /// int: i32, + /// float: f64, + /// string: String, + /// } + /// + /// let schema = SerdeArrowSchema::from_type::<Record>(TracingOptions::default())?; + /// let fields = schema.to_arrow_fields()?; + /// + /// assert_eq!(*fields[0].data_type(), DataType::Int32); + /// assert_eq!(*fields[1].data_type(), DataType::Float64); + /// assert_eq!(*fields[2].data_type(), DataType::LargeUtf8); + /// # Ok(()) + /// # } + /// ``` + /// + /// This approach requires the type to implement + /// [`Deserialize`][serde::Deserialize]. As only type information is used, + /// it is not possible to detect data dependent properties. E.g., it is not + /// possible to auto detect date time strings. + /// + /// Note, the type must encode a single "row" in the resulting data frame. + /// When encoding single arrays, use the [Item][crate::utils::Item] wrapper + /// instead of [Items][crate::utils::Items]. + /// + /// See [TracingOptions] for customization options. 
+ /// pub fn from_type<'de, T: Deserialize<'de>>(options: TracingOptions) -> Result<Self> { let mut tracer = Tracer::new(String::from("$"), options); tracer.trace_type::<T>()?; @@ -105,11 +169,57 @@ impl SerdeArrowSchema { /// Determine the schema from the given samples /// - /// To correctly record the type information make sure to: + /// ```rust + /// # fn main() -> serde_arrow::_impl::PanicOnError<()> { + /// # use serde_arrow::_impl::arrow; + /// use arrow::datatypes::DataType; + /// use serde::Serialize; + /// use serde_arrow::schema::{SerdeArrowSchema, TracingOptions}; + /// + /// ##[derive(Serialize)] + /// struct Record { + /// int: i32, + /// float: f64, + /// string: String, + /// } + /// + /// let samples = vec![ + /// Record { + /// int: 1, + /// float: 2.0, + /// string: String::from("hello") + /// }, + /// Record { + /// int: -1, + /// float: 32.0, + /// string: String::from("world") + /// }, + /// // ... + /// ]; + /// + /// let schema = SerdeArrowSchema::from_samples(&samples, TracingOptions::default())?; + /// let fields = schema.to_arrow_fields()?; + /// + /// assert_eq!(*fields[0].data_type(), DataType::Int32); + /// assert_eq!(*fields[1].data_type(), DataType::Float64); + /// assert_eq!(*fields[2].data_type(), DataType::LargeUtf8); + /// # Ok(()) + /// # } + /// ``` + /// + /// This approach requires the type to implement + /// [`Serialize`][serde::Serialize] and the samples to include all relevant + /// values. It uses only the information encoded in the samples to generate + /// the schema. 
Therefore, the following requirements must be met: + /// + /// - at least one `Some` value for `Option` fields + /// - all variants of enum fields + /// - at least one element of sequence fields (e.g., `Vec`) + /// - at least one example of map types (with all possible keys, if + /// [`options.map_as_struct == true`][TracingOptions::map_as_struct]) + /// (e.g., `HashMap`) /// - /// - include values for `Option` - /// - include all variants of an enum - /// - include at least single element of a list or a map + /// See [TracingOptions] for customization options. /// pub fn from_samples<T: Serialize + ?Sized>(samples: &T, options: TracingOptions) -> Result<Self> { let mut tracer = Tracer::new(String::from("$"), options); diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index c67380e8..195f276c 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -19,7 +19,8 @@ //! E.g., to convert Rust strings containing timestamps to Date64 arrays, the //! schema should contain a `Date64`. `serde_arrow` supports to derive the //! schema from the data itself via schema tracing, but does not require it. It -//! is always possible to specify the schema manually. See the [`schema`] module +//! is always possible to specify the schema manually. See the [`schema` +//! module][schema] and [SerdeArrowSchema][schema::SerdeArrowSchema] //! for further details. //! //! ## Overview diff --git a/serde_arrow/src/schema.rs b/serde_arrow/src/schema.rs index 4007be10..3d324feb 100644 --- a/serde_arrow/src/schema.rs +++ b/serde_arrow/src/schema.rs @@ -1,21 +1,17 @@ -//! Configure how Arrow and Rust types are translated into one another +//! The mapping between Rust and Arrow types //! -//! When tracing the schema using the `serialize_into_fields` methods, the -//! following defaults are used: +//! To convert between Rust objects and Arrow types, `serde_arrow` requires +//! schema information as a list of Arrow fields with additional metadata. See +//! 
[SerdeArrowSchema] for details on how to specify the schema. +//! +//! The default mapping of Rust types to Arrow types is as follows: //! //! - Strings: `LargeUtf8`, i.e., i64 offsets //! - Lists: `LargeList`, i.e., i64 offsets -//! - Strings with dictionary encoding: U32 keys and LargeUtf8 values -//! - Rationale: `polars` cannot handle 64 bit keys in its default -//! configuration -//! -//! Null-only fields (e.g., fields of type `()` or fields with only `None` -//! entries) result in errors per default. -//! [`TracingOptions::allow_null_fields`][crate::internal::tracing::TracingOptions::allow_null_fields] -//! allows to disable this behavior. +//! - Strings with dictionary encoding: `UInt32` keys and `LargeUtf8` values //! -//! All customization of the types happens via the metadata of the fields -//! structs describing arrays. For example, to let `serde_arrow` handle date +//! All customization of the types happens by including a suitable [Strategy] in +//! the metadata of the fields. For example, to let `serde_arrow` handle date //! time objects that are serialized to strings (chrono's default), use //! //! ```rust