Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-42235: [C++] list_parent_indices: Add support for list-view types #42236

Merged
merged 6 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion cpp/src/arrow/compute/kernels/vector_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,16 @@
#include "arrow/compute/api_vector.h"
#include "arrow/compute/kernels/common_internal.h"
#include "arrow/result.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/list_util.h"
#include "arrow/visit_type_inline.h"

namespace arrow {

using internal::CountSetBits;
using list_util::internal::RangeOfValuesUsed;

namespace compute {
namespace internal {
namespace {
Expand Down Expand Up @@ -76,6 +83,63 @@ struct ListParentIndicesArray {

Status Visit(const LargeListType& type) { return VisitList(type); }

template <typename Type, typename offset_type = typename Type::offset_type>
Status VisitListView(const Type&) {
ArraySpan list_view{*input};

const offset_type* offsets = list_view.GetValues<offset_type>(1);
const offset_type* sizes = list_view.GetValues<offset_type>(2);
offset_type values_offset;
offset_type values_length;
ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length),
RangeOfValuesUsed(list_view));

ARROW_ASSIGN_OR_RAISE(auto indices_validity,
AllocateEmptyBitmap(values_length, ctx->memory_pool()));
auto* out_indices_validity = indices_validity->mutable_data();
int64_t total_pop_count = 0;

ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t)));
auto* out_indices = indices->template mutable_data_as<int64_t>();
memset(out_indices, -1, values_length * sizeof(int64_t));

const auto* validity = list_view.GetValues<uint8_t>(0, 0);
RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
validity, list_view.offset, list_view.length,
[this, offsets, sizes, out_indices, out_indices_validity, values_offset,
&total_pop_count](int64_t run_start, int64_t run_length) {
for (int64_t i = run_start; i < run_start + run_length; ++i) {
auto validity_offset = offsets[i] - values_offset;
const int64_t pop_count =
CountSetBits(out_indices_validity, validity_offset, sizes[i]);
if (ARROW_PREDICT_FALSE(pop_count > 0)) {
return Status::Invalid(
"Function 'list_parent_indices' cannot produce parent indices for "
"values used by more than one list-view array element.");
}
bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]);
total_pop_count += sizes[i];
for (auto j = static_cast<int64_t>(offsets[i]);
j < static_cast<int64_t>(offsets[i]) + sizes[i]; ++j) {
out_indices[j - values_offset] = i + base_output_offset;
}
}
return Status::OK();
}));

DCHECK_LE(total_pop_count, values_length);
const int64_t null_count = values_length - total_pop_count;
BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr,
std::move(indices)};
out = std::make_shared<ArrayData>(int64(), values_length, std::move(buffers),
null_count);
return Status::OK();
}

Status Visit(const ListViewType& type) { return VisitListView(type); }

Status Visit(const LargeListViewType& type) { return VisitListView(type); }

Status Visit(const FixedSizeListType& type) {
using offset_type = typename FixedSizeListType::offset_type;
const offset_type slot_length = type.list_size();
Expand Down Expand Up @@ -125,7 +189,7 @@ const FunctionDoc list_flatten_doc(

const FunctionDoc list_parent_indices_doc(
"Compute parent indices of nested list values",
("`lists` must have a list-like type.\n"
("`lists` must have a list-like or list-view type.\n"
"For each value in each list of `lists`, the top-level list index\n"
"is emitted."),
{"lists"});
Expand All @@ -147,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction {

int64_t base_output_offset = 0;
ArrayVector out_chunks;
out_chunks.reserve(input->num_chunks());
for (const auto& chunk : input->chunks()) {
ARROW_ASSIGN_OR_RAISE(auto out_chunk,
ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(),
Expand Down
83 changes: 81 additions & 2 deletions cpp/src/arrow/compute/kernels/vector_nested_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,50 @@ TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) {
CheckVectorUnary("list_flatten", input, expected, &opts);
}

template <typename T = ListViewType, typename offset_type = typename T::offset_type>
void SwapListView(ArrayData* array, int64_t i, int64_t j) {
ASSERT_TRUE(is_list_view(array->type->id()));
ASSERT_EQ(array->type->id(), T::type_id);
ASSERT_LT(i, array->length);
ASSERT_LT(j, array->length);
auto* validity = array->GetMutableValues<uint8_t>(0);
if (validity) {
const bool is_valid_i = bit_util::GetBit(validity, array->offset + i);
const bool is_valid_j = bit_util::GetBit(validity, array->offset + j);
if (is_valid_i ^ is_valid_j) {
bit_util::SetBitTo(validity, array->offset + i, is_valid_j);
bit_util::SetBitTo(validity, array->offset + j, is_valid_i);
}
}
auto* offsets = array->GetMutableValues<offset_type>(1);
auto* sizes = array->GetMutableValues<offset_type>(2);
std::swap(offsets[i], offsets[j]);
std::swap(sizes[i], sizes[j]);
}

template <typename T = ListViewType, typename offset_type = typename T::offset_type>
void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) {
ASSERT_TRUE(is_list_view(array->type->id()));
ASSERT_EQ(array->type->id(), T::type_id);
ASSERT_LT(i, array->length);
auto* validity = array->GetMutableValues<uint8_t>(0);
if (validity) {
bit_util::SetBit(validity, array->offset + i);
}
auto* offsets = array->GetMutableValues<offset_type>(1);
auto* sizes = array->GetMutableValues<offset_type>(2);
offsets[i] = offset;
sizes[i] = size;
}

TEST(TestVectorNested, ListParentIndices) {
for (auto ty : {list(int16()), large_list(int16())}) {
const auto types = {
list(int16()),
large_list(int16()),
list_view(int16()),
large_list_view(int16()),
};
for (auto ty : types) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");

auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]");
Expand All @@ -196,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) {
auto tweaked = TweakValidityBit(input, 1, false);
auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
CheckVectorUnary("list_parent_indices", tweaked, expected);

{
// Construct a list-view with a non-empty null slot
auto input =
ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]");
auto tweaked = TweakValidityBit(input, 1, false);
auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... the fact that this gives a different result than for regular lists should perhaps be called out in the official documentation for this function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Tell me if it's clear enough.

CheckVectorUnary("list_parent_indices", tweaked, expected);

// Swap some list-view entries
felipecrv marked this conversation as resolved.
Show resolved Hide resolved
auto swapped = tweaked->data()->Copy();
SwapListView(swapped.get(), 0, 2);
SwapListView(swapped.get(), 1, 4);
AssertDatumsEqual(
swapped,
ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"),
/*verbose=*/true);
expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]");
CheckVectorUnary("list_parent_indices", swapped, expected);

// Make one view use values that are used by other list-views
SetListView(swapped.get(), 3, 1, 4);
AssertDatumsEqual(
swapped,
ArrayFromJSON(list_view(int16()),
"[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"),
/*verbose=*/true);
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("values used by more than one list-view"),
CallFunction("list_parent_indices", {input}));
}
}

TEST(TestVectorNested, ListParentIndicesChunkedArray) {
for (auto ty : {list(int16()), large_list(int16())}) {
const auto types = {
list(int16()),
large_list(int16()),
list_view(int16()),
large_list_view(int16()),
};
for (auto ty : types) {
auto input =
ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"});

Expand Down
9 changes: 6 additions & 3 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ of general type categories:

* "String-like": String, LargeString.

* "List-like": List, LargeList, sometimes also FixedSizeList.
* "List-like": List, LargeList, ListView, LargeListView, and sometimes also
FixedSizeList.
pitrou marked this conversation as resolved.
Show resolved Hide resolved

* "Nested": List-likes (including FixedSizeList), Struct, Union, and
related types like Map.
Expand Down Expand Up @@ -1813,8 +1814,10 @@ Structural transforms
list array are discarded.

* \(3) For each value in the list child array, the index at which it is found
in the list array is appended to the output. Nulls in the parent list array
are discarded.
in the list-like array is appended to the output. Indices of null lists in the
parent array might still be present in the output if they are non-empty null
lists. If the parent is a list-view, child array values that are not used by
any non-null list-view are null in the output.

* \(4) For each list element, compute the slice of that list element, then
return another list-like array of those slices. Can return either a
Expand Down
Loading