diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 8c77c261c6a98..955f9b8cbd14c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -21,9 +21,16 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/list_util.h" #include "arrow/visit_type_inline.h" namespace arrow { + +using internal::CountSetBits; +using list_util::internal::RangeOfValuesUsed; + namespace compute { namespace internal { namespace { @@ -76,6 +83,63 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + template + Status VisitListView(const Type&) { + ArraySpan list_view{*input}; + + const offset_type* offsets = list_view.GetValues(1); + const offset_type* sizes = list_view.GetValues(2); + int64_t values_offset; + int64_t values_length; + ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length), + RangeOfValuesUsed(list_view)); + + ARROW_ASSIGN_OR_RAISE(auto indices_validity, + AllocateEmptyBitmap(values_length, ctx->memory_pool())); + auto* out_indices_validity = indices_validity->mutable_data(); + int64_t total_pop_count = 0; + + ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t))); + auto* out_indices = indices->template mutable_data_as(); + memset(out_indices, -1, values_length * sizeof(int64_t)); + + const auto* validity = list_view.GetValues(0, 0); + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, list_view.offset, list_view.length, + [this, offsets, sizes, out_indices, out_indices_validity, values_offset, + &total_pop_count](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + auto validity_offset = offsets[i] - values_offset; + const int64_t pop_count = + CountSetBits(out_indices_validity, validity_offset, sizes[i]); + if (ARROW_PREDICT_FALSE(pop_count > 0)) { + return Status::Invalid( + "Function 'list_parent_indices' cannot produce parent indices for " + "values used by more than one list-view array element."); + } + bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]); + total_pop_count += sizes[i]; + for (auto j = static_cast(offsets[i]); + j < static_cast(offsets[i]) + sizes[i]; ++j) { + out_indices[j - values_offset] = i + base_output_offset; + } + } + return Status::OK(); + })); + + DCHECK_LE(total_pop_count, values_length); + const int64_t null_count = values_length - total_pop_count; + BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr, + std::move(indices)}; + out = std::make_shared(int64(), values_length, std::move(buffers), + null_count); + return Status::OK(); + } + + Status Visit(const ListViewType& type) { return VisitListView(type); } + + Status Visit(const LargeListViewType& type) { return VisitListView(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -125,7 +189,7 @@ const FunctionDoc list_flatten_doc( const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", - ("`lists` must have a list-like type.\n" + ("`lists` must have a list-like or list-view type.\n" "For each value in each list of `lists`, the top-level list index\n" "is emitted."), {"lists"}); @@ -147,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction { int64_t base_output_offset = 0; ArrayVector out_chunks; + out_chunks.reserve(input->num_chunks()); for (const auto& chunk : input->chunks()) { ARROW_ASSIGN_OR_RAISE(auto out_chunk, ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(), diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 56604ebd16cc0..da751fa5de403 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -183,8 +183,50 @@ TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } +template +void SwapListView(ArrayData* array, int64_t i, int64_t j) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + ASSERT_LT(j, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + const bool is_valid_i = bit_util::GetBit(validity, array->offset + i); + const bool is_valid_j = bit_util::GetBit(validity, array->offset + j); + if (is_valid_i ^ is_valid_j) { + bit_util::SetBitTo(validity, array->offset + i, is_valid_j); + bit_util::SetBitTo(validity, array->offset + j, is_valid_i); + } + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); +} + +template +void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + bit_util::SetBit(validity, array->offset + i); + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + offsets[i] = offset; + sizes[i] = size; +} + TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); @@ -196,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) { auto tweaked = TweakValidityBit(input, 1, false); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); CheckVectorUnary("list_parent_indices", tweaked, expected); + + { + // Construct a list-view with a non-empty null slot + auto input = + ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); + + // Swap some list-view entries + auto swapped = tweaked->data()->Copy(); + SwapListView(swapped.get(), 0, 2); + SwapListView(swapped.get(), 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"), + /*verbose=*/true); + expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]"); + CheckVectorUnary("list_parent_indices", swapped, expected); + + // Make one view use values that are used by other list-views + SetListView(swapped.get(), 3, 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), + "[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"), + /*verbose=*/true); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("values used by more than one list-view"), + CallFunction("list_parent_indices", {input})); + } } TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f0cd82e803121..91a0d87cb8ae7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -57,6 +57,8 @@ using internal::checked_cast; constexpr Type::type NullType::type_id; constexpr Type::type ListType::type_id; constexpr Type::type LargeListType::type_id; +constexpr Type::type ListViewType::type_id; +constexpr Type::type LargeListViewType::type_id; constexpr Type::type MapType::type_id; diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 2c403e9880f5c..4131bbdf6f912 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -181,7 +181,8 @@ of general type categories: * "String-like": String, LargeString. -* "List-like": List, LargeList, sometimes also FixedSizeList. +* "List-like": List, LargeList, ListView, LargeListView, and sometimes also + FixedSizeList. * "Nested": List-likes (including FixedSizeList), Struct, Union, and related types like Map. @@ -1813,8 +1814,10 @@ Structural transforms list array are discarded. * \(3) For each value in the list child array, the index at which it is found - in the list array is appended to the output. Nulls in the parent list array - are discarded. + in the list-like array is appended to the output. Indices of null lists in the + parent array might still be present in the output if they are non-empty null + lists. If the parent is a list-view, child array values that are not used by + any non-null list-view are null in the output. * \(4) For each list element, compute the slice of that list element, then return another list-like array of those slices. Can return either a