diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 4e859fdfe7ea..9db1fcc92e80 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -149,9 +149,10 @@ impl RecordBatch { // check that number of fields in schema match column length if schema.fields().len() != columns.len() { return Err(ArrowError::InvalidArgumentError(format!( - "number of columns({}) must match number of fields({}) in schema", + "Mismatch between columns [{}] and schema fields [{}].\nKnown schema fields[{}]", columns.len(), schema.fields().len(), + schema.field_names(false).join(","), ))); } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index f4741d743e57..abed4d3dc04b 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -287,7 +287,7 @@ async fn test_mismatched_record_batch_schema() { let err = result.unwrap_err(); assert_eq!( err.to_string(), - "Arrow(InvalidArgumentError(\"number of columns(1) must match number of fields(2) in schema\"))" + "Arrow(InvalidArgumentError(\"Mismatch between columns [1] and schema fields [2].\\nKnown schema fields[i,f]\"))" ); } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 574c024bb9b9..82b666ab6b38 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -328,20 +328,80 @@ impl Field { /// within `self` contained within this field (including `self`) pub(crate) fn fields(&self) -> Vec<&Field> { let mut collected_fields = vec![self]; - collected_fields.append(&mut Field::_fields(&self.data_type)); + collected_fields.append(&mut Field::_fields(&self.data_type, true)); collected_fields } - fn _fields(dt: &DataType) -> Vec<&Field> { + /// Returns a [`Vec`] direct children [`Field`]s + /// within `self` + pub(crate) fn nested_fields(&self) -> Vec<&Field> { + Field::_fields(&self.data_type, false) + } + + /// Return self and direct children field names of the [`Field`] + /// + /// ``` 
+ /// # use arrow_schema::*; + /// let field = Field::new("nested", + /// DataType::Struct( + /// Fields::from( + /// vec![ + /// Field::new("inner", + /// DataType::Struct( + /// Fields::from( + /// vec![ + /// Field::new("a", DataType::Int32, true) + /// ])), true)])), true + /// ); + /// + /// assert_eq!(field.children_names(), vec!["nested", "nested.inner", "nested.inner.a"]); + /// ``` + pub fn children_names(&self) -> Vec<String> { + fn nested_field_names_inner(f: &Field, parent_name: String, buffer: &mut Vec<String>) { + let current_name = format!("{}{}", parent_name, f.name()); + + // Push the concatenated name to the result vector + buffer.push(current_name.clone()); + + // Recursively concatenate child names + for child in f.nested_fields() { + nested_field_names_inner(child, format!("{}.", current_name), buffer); + } + } + + if !&self.data_type().is_nested() { + vec![] + } else { + let mut result: Vec<String> = Vec::new(); + nested_field_names_inner(self, "".to_string(), &mut result); + result + } + } + + // Return inner fields + // flatten - if inner fields needs to be flattened + fn _fields(dt: &DataType, flatten: bool) -> Vec<&Field> { match dt { - DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), - DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(), + DataType::Struct(fields) => { + if flatten { + fields.iter().flat_map(|f| f.fields()).collect() + } else { + fields.iter().map(|f| f.as_ref()).collect() + } + } + DataType::Union(fields, _) => { + if flatten { + fields.iter().flat_map(|(_, f)| f.fields()).collect() + } else { + fields.iter().map(|f| f.1.as_ref()).collect() + } + } DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => field.fields(), - DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), + DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref(), flatten), _ => vec![], } } diff --git 
a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index e547e5df3a5a..cfd9143ffb60 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -408,6 +408,29 @@ impl Schema { pub fn remove(&mut self, index: usize) -> FieldRef { self.fields.remove(index) } + + /// Returns a field names from schema with references to fields + /// * `nested` - include nested fields. + #[inline] + pub fn field_names(&self, nested: bool) -> Vec<String> { + if nested { + self.fields + .iter() + .flat_map(|f| { + if f.data_type().is_nested() { + f.children_names() + } else { + vec![f.name().to_string()] + } + }) + .collect::<Vec<String>>() + } else { + self.fields + .iter() + .map(|f| f.name().clone()) + .collect::<Vec<String>>() + } + } } impl fmt::Display for Schema { @@ -957,4 +980,29 @@ mod tests { assert_eq!(out.metadata["k"], "v"); assert_eq!(out.metadata["key"], "value"); } + + #[test] + fn test_schema_field_names() { + use crate::Field; + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("a", DataType::Int32, false)); + builder.push(Field::new("b", DataType::Utf8, false)); + builder.push(Field::new( + "nested", + DataType::Struct(Fields::from(vec![Field::new( + "inner", + DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, true)])), + true, + )])), + true, + )); + + let schema = builder.finish(); + assert_eq!(schema.field_names(false), vec!["a", "b", "nested"]); + + assert_eq!( + schema.field_names(true), + vec!["a", "b", "nested", "nested.inner", "nested.inner.a"] + ); + } }