From 7f8c603e1d960ebb4534ca1f3ce84eb60a5bc302 Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 1 Dec 2023 14:03:30 -0800 Subject: [PATCH 1/3] Adding `is_null` datatype shortcut method --- arrow-schema/src/datatype.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index b78c785ae279..330ae5c9e346 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -350,23 +350,27 @@ impl DataType { } /// Returns true if this type is floating: (Float*). + #[inline] pub fn is_floating(&self) -> bool { use DataType::*; matches!(self, Float16 | Float32 | Float64) } /// Returns true if this type is integer: (Int*, UInt*). + #[inline] pub fn is_integer(&self) -> bool { self.is_signed_integer() || self.is_unsigned_integer() } /// Returns true if this type is signed integer: (Int*). + #[inline] pub fn is_signed_integer(&self) -> bool { use DataType::*; matches!(self, Int8 | Int16 | Int32 | Int64) } /// Returns true if this type is unsigned integer: (UInt*). + #[inline] pub fn is_unsigned_integer(&self) -> bool { use DataType::*; matches!(self, UInt8 | UInt16 | UInt32 | UInt64) @@ -387,6 +391,7 @@ impl DataType { /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, /// or Map), or a dictionary of a nested type + #[inline] pub fn is_nested(&self) -> bool { use DataType::*; match self { @@ -398,6 +403,13 @@ impl DataType { } } + /// Returns true if this type is DataType::Null. + #[inline] + pub fn is_null(&self) -> bool { + use DataType::*; + matches!(self, Null) + } + /// Compares the datatype with another, ignoring nested field names /// and metadata. pub fn equals_datatype(&self, other: &DataType) -> bool { @@ -855,6 +867,12 @@ mod tests { assert!(!DataType::is_floating(&DataType::Int32)); } + #[test] + fn test_datatype_is_null() { + assert!(DataType::is_null(&DataType::Null)); + assert!(!DataType::is_null(&DataType::Int32)); + } + #[test] fn size_should_not_regress() { assert_eq!(std::mem::size_of::(), 24); From 28af1b38f1a4bc1e7aa4862d516f46ae7ad379e4 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 7 Dec 2023 08:43:30 -0800 Subject: [PATCH 2/3] Introduce `Schema` fields names method --- arrow-array/src/record_batch.rs | 3 +- arrow-flight/tests/encode_decode.rs | 2 +- arrow-schema/src/field.rs | 61 +++++++++++++++++++++++++++++ arrow-schema/src/schema.rs | 48 +++++++++++++++++++++++ 4 files changed, 112 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 4e859fdfe7ea..9db1fcc92e80 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -149,9 +149,10 @@ impl RecordBatch { // check that number of fields in schema match column length if schema.fields().len() != columns.len() { return Err(ArrowError::InvalidArgumentError(format!( - "number of columns({}) must match number of fields({}) in schema", + "Mismatch between columns [{}] and schema fields [{}].\nKnown schema fields[{}]", columns.len(), schema.fields().len(), + schema.field_names(false).join(","), ))); } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index f4741d743e57..abed4d3dc04b 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -287,7 +287,7 @@ async fn test_mismatched_record_batch_schema() { let err = result.unwrap_err(); assert_eq!( err.to_string(), - "Arrow(InvalidArgumentError(\"number of columns(1) must match number of fields(2) in schema\"))" + "Arrow(InvalidArgumentError(\"Mismatch between columns [1] and schema fields [2].\\nKnown schema fields[i,f]\"))" ); } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 574c024bb9b9..00cb4609d451 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -333,6 +333,67 @@ impl Field { collected_fields } + /// Returns a [`Vec`] direct children [`Field`]s + /// within `self` + pub(crate) fn nested_fields(&self) -> Vec<&Field> { + Field::_nested_fields(&self.data_type) + } + + /// Return self and direct children field names of the [`Field`] + /// + /// ``` + /// # use arrow_schema::*; + /// let field = Field::new("nested", + /// DataType::Struct( + /// Fields::from( + /// vec![ + /// Field::new("inner", + /// DataType::Struct( + /// Fields::from( + /// vec![ + /// Field::new("a", DataType::Int32, true) + /// ])), true)])), true + /// ); + /// + /// assert_eq!(field.children_names(), vec!["nested", "nested.inner", "nested.inner.a"]); + /// ``` + pub fn children_names(&self) -> Vec { + fn nested_field_names_inner(f: &Field, parent_name: String, buffer: &mut Vec) { + let current_name = format!("{}{}", parent_name, f.name()); + + // Push the concatenated name to the result vector + buffer.push(current_name.clone()); + + // Recursively concatenate child names + for child in f.nested_fields() { + nested_field_names_inner(child, format!("{}.", current_name), buffer); + } + } + + if !&self.data_type().is_nested() { + vec![] + } else { + let mut result: Vec = Vec::new(); + nested_field_names_inner(self, "".to_string(), &mut result); + result + } + } + + // Return inner fields not flattened + fn _nested_fields(dt: &DataType) -> Vec<&Field> { + match dt { + DataType::Struct(fields) => fields.iter().map(|f| f.as_ref()).collect(), + DataType::Union(fields, _) => fields.iter().map(|f| f.1.as_ref()).collect(), + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => field.fields(), + DataType::Dictionary(_, value_field) => Field::_nested_fields(value_field.as_ref()), + _ => vec![], + } + } + + // Return inner fields flattened fn _fields(dt: &DataType) -> Vec<&Field> { match dt { DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index e547e5df3a5a..cfd9143ffb60 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -408,6 +408,29 @@ impl Schema { pub fn remove(&mut self, index: usize) -> FieldRef { self.fields.remove(index) } + + /// Returns a field names from schema with references to fields + /// * `nested` - include nested fields. + #[inline] + pub fn field_names(&self, nested: bool) -> Vec { + if nested { + self.fields + .iter() + .flat_map(|f| { + if f.data_type().is_nested() { + f.children_names() + } else { + vec![f.name().to_string()] + } + }) + .collect::>() + } else { + self.fields + .iter() + .map(|f| f.name().clone()) + .collect::>() + } + } } impl fmt::Display for Schema { @@ -957,4 +980,29 @@ mod tests { assert_eq!(out.metadata["k"], "v"); assert_eq!(out.metadata["key"], "value"); } + + #[test] + fn test_schema_field_names() { + use crate::Field; + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("a", DataType::Int32, false)); + builder.push(Field::new("b", DataType::Utf8, false)); + builder.push(Field::new( + "nested", + DataType::Struct(Fields::from(vec![Field::new( + "inner", + DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, true)])), + true, + )])), + true, + )); + + let schema = builder.finish(); + assert_eq!(schema.field_names(false), vec!["a", "b", "nested"]); + + assert_eq!( + schema.field_names(true), + vec!["a", "b", "nested", "nested.inner", "nested.inner.a"] + ); + } } From 87f5c017031a11b98f3bb5c6efc779c92f6d3d33 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 12 Dec 2023 09:31:02 -0800 Subject: [PATCH 3/3] simplify --- arrow-schema/src/field.rs | 41 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 00cb4609d451..82b666ab6b38 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -328,7 +328,7 @@ impl Field { /// within `self` contained within this field (including `self`) pub(crate) fn fields(&self) -> Vec<&Field> { let mut collected_fields = vec![self]; - collected_fields.append(&mut Field::_fields(&self.data_type)); + collected_fields.append(&mut Field::_fields(&self.data_type, true)); collected_fields } @@ -336,7 +336,7 @@ impl Field { /// Returns a [`Vec`] direct children [`Field`]s /// within `self` pub(crate) fn nested_fields(&self) -> Vec<&Field> { - Field::_nested_fields(&self.data_type) + Field::_fields(&self.data_type, false) } /// Return self and direct children field names of the [`Field`] @@ -379,30 +379,29 @@ impl Field { } } - // Return inner fields not flattened - fn _nested_fields(dt: &DataType) -> Vec<&Field> { + // Return inner fields + // flatten - if inner fields needs to be flattened + fn _fields(dt: &DataType, flatten: bool) -> Vec<&Field> { match dt { - DataType::Struct(fields) => fields.iter().map(|f| f.as_ref()).collect(), - DataType::Union(fields, _) => fields.iter().map(|f| f.1.as_ref()).collect(), - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => field.fields(), - DataType::Dictionary(_, value_field) => Field::_nested_fields(value_field.as_ref()), - _ => vec![], - } - } - - // Return inner fields flattened - fn _fields(dt: &DataType) -> Vec<&Field> { - match dt { - DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), - DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(), + DataType::Struct(fields) => { + if flatten { + fields.iter().flat_map(|f| f.fields()).collect() + } else { + fields.iter().map(|f| f.as_ref()).collect() + } + } + DataType::Union(fields, _) => { + if flatten { + fields.iter().flat_map(|(_, f)| f.fields()).collect() + } else { + fields.iter().map(|f| f.1.as_ref()).collect() + } + } DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => field.fields(), - DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), + DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref(), flatten), _ => vec![], } }