From 71c6acf24d8413fc0fe71312f47cda6f83824054 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Oct 2024 17:58:49 +0800 Subject: [PATCH] reorganize logic and add comment --- cpp/src/parquet/arrow/schema.cc | 88 ++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index e17561782fcc2..8bdfb047b66d0 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -513,11 +513,13 @@ Status PopulateLeaf(int column_index, const std::shared_ptr& field, } // Special case mentioned in the format spec: -// If the name is array or ends in _tuple, this should be a list of struct -// even for single child elements. -bool HasStructListName(const GroupNode& node) { +// If the name is array or uses the parent's name with `_tuple` appended, +// this should be: +// - a list of list or map type if the repeated group node is LIST- or MAP-annotated. +// - otherwise, a list of struct even for single child elements. 
+bool HasListElementName(const GroupNode& node, const GroupNode& parent) { ::std::string_view name{node.name()}; - return name == "array" || EndsWith(name, "_tuple"); + return name == "array" || name == (parent.name() + "_tuple"); } Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, @@ -598,9 +600,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, ctx->LinkParent(value_field, key_value_field); // required/optional group name=whatever { - // repeated group name=key_values{ + // repeated group name=key_values { // required TYPE key; - // required/optional TYPE value; + // required/optional TYPE value; // } // } // @@ -651,42 +653,44 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); if (list_node.is_group()) { - // Resolve 3-level encoding - // - // required/optional group name=whatever { - // repeated group name=list { - // required/optional TYPE item; - // } - // } - // - // yields list<item: TYPE ?nullable> ?nullable - // - // We distinguish the special case that we have - // - // required/optional group name=whatever { - // repeated group name=array or $SOMETHING_tuple { - // required/optional TYPE item; - // } - // } - // - // In this latter case, the inner type of the list should be a struct - // rather than a primitive value - // - // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable const auto& list_group = static_cast(list_node); - // Special case mentioned in the format spec: - // If the name is array or ends in _tuple, this should be a list of struct - // even for single child elements. 
- if (list_group.field_count() == 1 && !HasStructListName(list_group)) { - // List of primitive type - RETURN_NOT_OK( - NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); - } else if (list_group.field_count() == 1 && list_group.field(0)->is_repeated()) { - // Special case for nested list in two-level list encoding - RETURN_NOT_OK( - NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); - } else { + if (list_group.field_count() > 1) { + // The inner type of the list should be a struct when there are multiple fields + // in the repeated group RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); + } else if (list_group.field_count() == 1) { + const auto& repeated_field = list_group.field(0); + if (repeated_field->is_repeated()) { + RETURN_NOT_OK( + NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + } else if (HasListElementName(list_group, group)) { + // We distinguish the special case that we have + // + // required/optional group name=SOMETHING { + // repeated group name=array or $SOMETHING_tuple { + // required/optional TYPE item; + // } + // } + // + // The inner type of the list should be a struct rather than a primitive value + // + // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable + RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); + } else { + // Resolve 3-level encoding + // + // required/optional group name=whatever { + // repeated group name=list { + // required/optional TYPE item; + // } + // } + // + // yields list<item: TYPE ?nullable> ?nullable + RETURN_NOT_OK( + NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + } + } else { + return Status::Invalid("Group must have at least one child."); } } else { // Two-level list encoding @@ -694,6 +698,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // required/optional group LIST { // repeated TYPE; // } + // + // TYPE is a primitive type + // + // 
+ yields list<item: TYPE not null> ?nullable const auto& primitive_node = static_cast(list_node); int column_index = ctx->schema->GetColumnIndex(primitive_node); ASSIGN_OR_RAISE(std::shared_ptr type,