From b98763ad079f20e36d82268f9e7cf0db49fdd461 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Tue, 16 Apr 2024 02:00:10 +0800 Subject: [PATCH] GH-41055: [C++] Support flatten for combining nested list related types (#41092) ### Rationale for this change Support flattening arrays whose type nests several list-like types (e.g. a list of lists of fixed-size lists). ### What changes are included in this PR? Add a recursive flatten function, `FlattenRecursively`, that automatically detects nested list-like types and keeps flattening until an array of non-list values is reached. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes, users can now flatten an array of nested list-like types in a single call by using the new `FlattenRecursively` API. * GitHub Issue: #41055 Authored-by: ZhangHuiGui Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/array_list_test.cc | 69 +++++++++++++++++++++++++- cpp/src/arrow/array/array_nested.cc | 44 ++++++++++++++++ cpp/src/arrow/array/array_nested.h | 32 ++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index b08fa99168616..18afcc90d71f8 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -735,7 +735,7 @@ class TestListArray : public ::testing::Test { ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); auto sliced_list_array = std::dynamic_pointer_cast(list_array->Slice(3, 4)); - ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK_AND_ASSIGN(auto flattened, sliced_list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); // Note the difference between values() and Flatten().
EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[5, 6]"))); @@ -763,6 +763,52 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } + void TestFlattenRecursively() { + auto inner_type = std::make_shared(int32()); + auto type = std::make_shared(inner_type); + + // List types with two nesting levels: list<list<int32>> + auto nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])")); + ASSERT_OK_AND_ASSIGN(auto flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_TRUE( + flattened->Equals(ArrayFromJSON(int32(), "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"))); + + // A nested list holding only a null still flattens until a non-list type is reached + nested_list_array = + std::dynamic_pointer_cast(ArrayFromJSON(type, R"([null])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_TRUE(flattened->type()->Equals(int32())); + + // List types with three nesting levels: list<list<fixed_size_list<int32, 2>>> + type = std::make_shared(std::make_shared(fixed_size_list(int32(), 2))); + nested_list_array = std::dynamic_pointer_cast(ArrayFromJSON(type, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])")); + ASSERT_OK_AND_ASSIGN(flattened, nested_list_array->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(10, flattened->length()); + ASSERT_EQ(3, flattened->null_count()); + ASSERT_TRUE(flattened->Equals( + ArrayFromJSON(int32(), "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"))); + } + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, std::vector sizes, std::shared_ptr values, int64_t offset = 0) { @@ -925,10 +971,12 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } TYPED_TEST(TestListArray, FlattenNulls) {
this->TestFlattenNulls(); } TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } +TYPED_TEST(TestListArray, FlattenSliced) { this->TestFlattenSliced(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } +TYPED_TEST(TestListArray, FlattenRecursively) { this->TestFlattenRecursively(); } TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } @@ -1714,4 +1762,23 @@ TEST_F(TestFixedSizeListArray, Flatten) { } } +TEST_F(TestFixedSizeListArray, FlattenRecursively) { + // Nested fixed-size list-array: fixed_size_list(fixed_size_list(int32, 2), 2) + auto inner_type = fixed_size_list(value_type_, 2); + type_ = fixed_size_list(inner_type, 2); + + auto values = std::dynamic_pointer_cast(ArrayFromJSON(type_, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])")); + ASSERT_OK(values->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, values->FlattenRecursively()); + ASSERT_OK(flattened->ValidateFull()); + ASSERT_EQ(8, flattened->length()); + ASSERT_EQ(2, flattened->null_count()); + AssertArraysEqual(*flattened, + *ArrayFromJSON(value_type_, "[0, 1, null, 3, 7, null, 2, 5]")); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 958c2e25380b0..24e0dfb7081ac 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -42,6 +42,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/list_util.h" #include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -469,6 +470,49 @@ inline void SetListData(VarLengthListLikeArray* self, self->values_ = MakeArray(self->data_->child_data[0]); } +Result> FlattenLogicalListRecursively(const Array& in_array, + MemoryPool* memory_pool) { + std::shared_ptr array = in_array.Slice(0, in_array.length()); + for 
(auto kind = array->type_id(); is_list(kind) || is_list_view(kind); + kind = array->type_id()) { + switch (kind) { + case Type::LIST: { + ARROW_ASSIGN_OR_RAISE( + array, (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LIST_VIEW: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::LARGE_LIST_VIEW: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + case Type::FIXED_SIZE_LIST: { + ARROW_ASSIGN_OR_RAISE( + array, + (checked_cast(array.get())->Flatten(memory_pool))); + break; + } + default: + Unreachable("unexpected non-list type"); + break; + } + } + return array; +} + } // namespace internal // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 768a630e0af54..5744f5fcadf05 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -58,6 +58,20 @@ void SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); +/// \brief A version of Flatten that keeps recursively flattening until an array of +/// non-list values is reached. +/// +/// Array types considered to be lists by this function: +/// - list +/// - large_list +/// - list_view +/// - large_list_view +/// - fixed_size_list +/// +/// \see ListArray::Flatten +ARROW_EXPORT Result> FlattenLogicalListRecursively( + const Array& in_array, MemoryPool* memory_pool); + } // namespace internal /// Base class for variable-sized list and list-view arrays, regardless of offset size. 
@@ -103,6 +117,15 @@ class VarLengthListLikeArray : public Array { return values_->Slice(value_offset(i), value_length(i)); } + /// \brief Flatten all levels recursively until a non-list type is reached, and + /// return the resulting non-list Array. + /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + protected: friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, @@ -595,6 +618,15 @@ class ARROW_EXPORT FixedSizeListArray : public Array { Result> Flatten( MemoryPool* memory_pool = default_memory_pool()) const; + /// \brief Flatten all levels recursively until a non-list type is reached, and + /// return the resulting non-list Array. + /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + /// \brief Construct FixedSizeListArray from child value array and value_length /// /// \param[in] values Array containing list values