From fcae5c98d74bed74b3e9fda39eb92dab5dd8f340 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 20 Jun 2024 14:01:40 -0300 Subject: [PATCH 1/6] list_parent_indices: Add support for list-view types --- .../arrow/compute/kernels/vector_nested.cc | 65 ++++++++++++++- .../compute/kernels/vector_nested_test.cc | 83 ++++++++++++++++++- docs/source/cpp/compute.rst | 3 +- 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 8c77c261c6a98..f4baa7bfebce5 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -21,9 +21,16 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/list_util.h" #include "arrow/visit_type_inline.h" namespace arrow { + +using internal::CountSetBits; +using list_util::internal::RangeOfValuesUsed; + namespace compute { namespace internal { namespace { @@ -76,6 +83,61 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + template + Status VisitListView(const Type&) { + ArraySpan list_view{*input}; + + const offset_type* offsets = list_view.GetValues(1); + const offset_type* sizes = list_view.GetValues(2); + offset_type values_offset; + offset_type values_length; + ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length), + RangeOfValuesUsed(list_view)); + + ARROW_ASSIGN_OR_RAISE(auto indices_validity, + AllocateEmptyBitmap(values_length, ctx->memory_pool())); + auto* out_indices_validity = indices_validity->mutable_data(); + + ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t))); + auto* out_indices = indices->template mutable_data_as(); + memset(out_indices, -1, values_length * sizeof(int64_t)); + + const auto* validity = list_view.GetValues(0, 0); + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, list_view.offset, list_view.length, + [this, offsets, sizes, out_indices, out_indices_validity, values_offset]( + int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + const int64_t pop_count = + CountSetBits(out_indices_validity, offsets[i] - values_offset, sizes[i]); + if (ARROW_PREDICT_FALSE(pop_count > 0)) { + return Status::ExecutionError( + "Function 'list_parent_indices' cannot produce parent indices for " + "values used by more than one list-view array element."); + } + bit_util::SetBitmap(out_indices_validity, offsets[i] - values_offset, + sizes[i]); + for (auto j = static_cast(offsets[i]); + j < static_cast(offsets[i]) + sizes[i]; ++j) { + out_indices[j - values_offset] = i + base_output_offset; + } + } + return Status::OK(); + })); + + const int64_t null_count = + values_length - CountSetBits(out_indices_validity, 0, values_length); + BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr, + std::move(indices)}; + out = std::make_shared(int64(), values_length, std::move(buffers), + null_count); + return Status::OK(); + } + + Status Visit(const ListViewType& type) { return VisitListView(type); } + + Status Visit(const LargeListViewType& type) { return VisitListView(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -125,7 +187,7 @@ const FunctionDoc list_flatten_doc( const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", - ("`lists` must have a list-like type.\n" + ("`lists` must have a list-like or list-view type.\n" "For each value in each list of `lists`, the top-level list index\n" "is emitted."), {"lists"}); @@ -147,6 +209,7 @@ class ListParentIndicesFunction : public MetaFunction { int64_t base_output_offset = 0; ArrayVector out_chunks; + out_chunks.reserve(input->num_chunks()); for (const auto& chunk : input->chunks()) { ARROW_ASSIGN_OR_RAISE(auto out_chunk, ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(), diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 56604ebd16cc0..ee1ed5e5641d7 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -183,8 +183,50 @@ TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } +template +void SwapListView(ArrayData* array, int64_t i, int64_t j) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + ASSERT_LT(j, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + const bool is_valid_i = bit_util::GetBit(validity, array->offset + i); + const bool is_valid_j = bit_util::GetBit(validity, array->offset + j); + if (is_valid_i ^ is_valid_j) { + bit_util::SetBitTo(validity, array->offset + i, is_valid_j); + bit_util::SetBitTo(validity, array->offset + j, is_valid_i); + } + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); +} + +template +void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + bit_util::SetBit(validity, array->offset + i); + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + offsets[i] = offset; + sizes[i] = size; +} + TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); @@ -196,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) { auto tweaked = TweakValidityBit(input, 1, false); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); CheckVectorUnary("list_parent_indices", tweaked, expected); + + { + // Construct a list-view with a non-empty null slot + auto input = + ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); + + // Swap some list-view entries + auto swapped = tweaked->data()->Copy(); + SwapListView(swapped.get(), 0, 2); + SwapListView(swapped.get(), 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"), + /*verbose=*/true); + expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]"); + CheckVectorUnary("list_parent_indices", swapped, expected); + + // Make one view use values that are used by other list-views + SetListView(swapped.get(), 3, 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), + "[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"), + /*verbose=*/true); + EXPECT_RAISES_WITH_MESSAGE_THAT( + ExecutionError, ::testing::HasSubstr("values used by more than one list-view"), + CallFunction("list_parent_indices", {input})); + } } TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 2c403e9880f5c..ac240328dbefe 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -181,7 +181,8 @@ of general type categories: * "String-like": String, LargeString. -* "List-like": List, LargeList, sometimes also FixedSizeList. +* "List-like": List, LargeList, ListView, LargeListView, and sometimes also + FixedSizeList. * "Nested": List-likes (including FixedSizeList), Struct, Union, and related types like Map. From c59d6e3bbcb37760d13ad907775af09a9616fc7f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Jun 2024 16:53:27 -0300 Subject: [PATCH 2/6] docs: Update comment about nulls and list_parent_indices --- docs/source/cpp/compute.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index ac240328dbefe..4131bbdf6f912 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1814,8 +1814,10 @@ Structural transforms list array are discarded. * \(3) For each value in the list child array, the index at which it is found - in the list array is appended to the output. Nulls in the parent list array - are discarded. + in the list-like array is appended to the output. Indices of null lists in the + parent array might still be present in the output if they are non-empty null + lists. If the parent is a list-view, child array values that are not used by + any non-null list-view are null in the output. * \(4) For each list element, compute the slice of that list element, then return another list-like array of those slices. Can return either a From 3f744a9ab722ba500b18414b2ec92d9177dfae1a Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Jun 2024 19:06:06 -0300 Subject: [PATCH 3/6] vector_nested.cc: Use Invalid instead of ExecutionError --- cpp/src/arrow/compute/kernels/vector_nested.cc | 2 +- cpp/src/arrow/compute/kernels/vector_nested_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index f4baa7bfebce5..c9d3b7dc8c385 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -111,7 +111,7 @@ struct ListParentIndicesArray { const int64_t pop_count = CountSetBits(out_indices_validity, offsets[i] - values_offset, sizes[i]); if (ARROW_PREDICT_FALSE(pop_count > 0)) { - return Status::ExecutionError( + return Status::Invalid( "Function 'list_parent_indices' cannot produce parent indices for " "values used by more than one list-view array element."); } diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index ee1ed5e5641d7..da751fa5de403 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -266,7 +266,7 @@ TEST(TestVectorNested, ListParentIndices) { "[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"), /*verbose=*/true); EXPECT_RAISES_WITH_MESSAGE_THAT( - ExecutionError, ::testing::HasSubstr("values used by more than one list-view"), + Invalid, ::testing::HasSubstr("values used by more than one list-view"), CallFunction("list_parent_indices", {input})); } } From 11309c25f4a2547b3d626860257136e33c64c0c5 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Jun 2024 19:14:03 -0300 Subject: [PATCH 4/6] vector_nested.cc: Avoid a final CountSetBits call --- cpp/src/arrow/compute/kernels/vector_nested.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index c9d3b7dc8c385..565e95cacc3a7 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -97,6 +97,7 @@ struct ListParentIndicesArray { ARROW_ASSIGN_OR_RAISE(auto indices_validity, AllocateEmptyBitmap(values_length, ctx->memory_pool())); auto* out_indices_validity = indices_validity->mutable_data(); + int64_t total_pop_count = 0; ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t))); auto* out_indices = indices->template mutable_data_as(); @@ -105,18 +106,19 @@ struct ListParentIndicesArray { const auto* validity = list_view.GetValues(0, 0); RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( validity, list_view.offset, list_view.length, - [this, offsets, sizes, out_indices, out_indices_validity, values_offset]( - int64_t run_start, int64_t run_length) { + [this, offsets, sizes, out_indices, out_indices_validity, values_offset, + &total_pop_count](int64_t run_start, int64_t run_length) { for (int64_t i = run_start; i < run_start + run_length; ++i) { + auto validity_offset = offsets[i] - values_offset; const int64_t pop_count = - CountSetBits(out_indices_validity, offsets[i] - values_offset, sizes[i]); + CountSetBits(out_indices_validity, validity_offset, sizes[i]); if (ARROW_PREDICT_FALSE(pop_count > 0)) { return Status::Invalid( "Function 'list_parent_indices' cannot produce parent indices for " "values used by more than one list-view array element."); } - bit_util::SetBitmap(out_indices_validity, offsets[i] - values_offset, - sizes[i]); + bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]); + total_pop_count += sizes[i]; for (auto j = static_cast(offsets[i]); j < static_cast(offsets[i]) + sizes[i]; ++j) { out_indices[j - values_offset] = i + base_output_offset; @@ -125,8 +127,8 @@ struct ListParentIndicesArray { return Status::OK(); })); - const int64_t null_count = - values_length - CountSetBits(out_indices_validity, 0, values_length); + DCHECK_LE(total_pop_count, values_length); + const int64_t null_count = values_length - total_pop_count; BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr, std::move(indices)}; out = std::make_shared(int64(), values_length, std::move(buffers), From e6118a9b5955b7c967771301bbccbc3163ea7e4d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 25 Jun 2024 22:16:15 -0300 Subject: [PATCH 5/6] FIX: use int64_t instead of offset_type --- cpp/src/arrow/compute/kernels/vector_nested.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 565e95cacc3a7..955f9b8cbd14c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -89,8 +89,8 @@ struct ListParentIndicesArray { const offset_type* offsets = list_view.GetValues(1); const offset_type* sizes = list_view.GetValues(2); - offset_type values_offset; - offset_type values_length; + int64_t values_offset; + int64_t values_length; ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length), RangeOfValuesUsed(list_view)); From 3ec393929390c58b37f35be83350f56fae2a8df5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 26 Jun 2024 10:50:20 +0200 Subject: [PATCH 6/6] Attempt to fix MinGW64 build --- cpp/src/arrow/type.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f0cd82e803121..91a0d87cb8ae7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -57,6 +57,8 @@ using internal::checked_cast; constexpr Type::type NullType::type_id; constexpr Type::type ListType::type_id; constexpr Type::type LargeListType::type_id; +constexpr Type::type ListViewType::type_id; +constexpr Type::type LargeListViewType::type_id; constexpr Type::type MapType::type_id;