diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a6117011535e..9c6b483b6a232 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -228,6 +228,7 @@ set(ARROW_SRCS util/hashing.cc util/int_util.cc util/io_util.cc + util/list_util.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -789,6 +790,7 @@ add_arrow_test(array_test array/array_binary_test.cc array/array_dict_test.cc array/array_list_test.cc + array/array_list_view_test.cc array/array_run_end_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index eab71de27b11a..b483ec420cc3c 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -95,7 +95,7 @@ struct ScalarFromArraySlotImpl { Status Visit(const MonthDayNanoIntervalArray& a) { return Finish(a.Value(index_)); } template - Status Visit(const BaseListArray& a) { + Status Visit(const VarLengthListLikeArray& a) { return Finish(a.value_slice(index_)); } diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a3a2f99851b55..0b591d401804d 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -41,10 +41,11 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using ListTypes = ::testing::Types; +using ListAndListViewTypes = + ::testing::Types; // ---------------------------------------------------------------------- -// List tests +// List and ListView tests template class TestListArray : public ::testing::Test { @@ -57,7 +58,9 @@ class TestListArray : public ::testing::Test { using OffsetArrayType = typename TypeTraits::OffsetArrayType; using OffsetBuilderType = typename TypeTraits::OffsetBuilderType; - void SetUp() { + static constexpr bool kTypeClassIsListView = is_list_view_type::value; + + void SetUp() override { value_type_ = int16(); type_ = 
std::make_shared(value_type_); @@ -72,8 +75,10 @@ class TestListArray : public ::testing::Test { result_ = std::dynamic_pointer_cast(out); } - void ValidateBasicListArray(const ArrayType* result, const std::vector& values, - const std::vector& is_valid) { + private: + void DoValidateBasicListArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -108,6 +113,58 @@ class TestListArray : public ::testing::Test { result_->raw_value_offsets()[result->length()]); } + void DoValidateBasicListViewArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + std::vector ex_offsets = {0, 3, 3}; + std::vector ex_sizes = {3, 0}; + for (size_t i = 0; i < ex_sizes.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + ASSERT_EQ(ex_sizes[i], result->value_length(i)); + } + ASSERT_EQ(ex_offsets[ex_sizes.size()], result->value_offset(ex_sizes.size())); + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + auto varr = std::dynamic_pointer_cast(result->values()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } + + auto offsets = std::dynamic_pointer_cast(result->offsets()); + auto sizes = std::dynamic_pointer_cast(result->sizes()); + ASSERT_EQ(offsets->length(), result->length()); + ASSERT_EQ(offsets->null_count(), 0); + AssertTypeEqual(*offsets->type(), OffsetType()); + ASSERT_EQ(sizes->length(), result->length()); + ASSERT_EQ(sizes->null_count(), 0); + AssertTypeEqual(*sizes->type(), OffsetType()); + + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(offsets->Value(i), 
result_->raw_value_offsets()[i]); + ASSERT_EQ(sizes->Value(i), result_->raw_value_sizes()[i]); + } + } + + void ValidateBasicListArray(const ArrayType* result, const std::vector& values, + const std::vector& is_valid) { + if constexpr (kTypeClassIsListView) { + return DoValidateBasicListViewArray(result, values, is_valid); + } else { + return DoValidateBasicListArray(result, values, is_valid); + } + } + + public: void TestBasics() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector lengths = {3, 0, 4}; @@ -120,7 +177,7 @@ class TestListArray : public ::testing::Test { int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_valid[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0, lengths[i])); for (int j = 0; j < lengths[i]; ++j) { ASSERT_OK(vb->Append(values[pos++])); } @@ -133,25 +190,29 @@ class TestListArray : public ::testing::Test { void TestEquality() { auto vb = checked_cast(builder_->value_builder()); - std::shared_ptr array, equal_array, unequal_array; + std::shared_ptr array, equal_array; std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + std::vector equal_sizes = {1, 1, 3, 1, 1, 1, 2, 0}; std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + + std::shared_ptr unequal_array; std::vector unequal_offsets = {0, 1, 4, 7}; + std::vector unequal_sizes = {1, 3, 3, 0}; std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; - // setup two equal arrays - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - ASSERT_OK(builder_->Finish(&array)); - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); + 
ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); ASSERT_OK(builder_->Finish(&equal_array)); - // now an unequal one - ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); - ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); + ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_sizes.data(), + unequal_offsets.size())); + ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality @@ -197,16 +258,37 @@ class TestListArray : public ::testing::Test { EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); } - void TestFromArraysWithNullBitMap() { - std::shared_ptr offsets_w_nulls, offsets_wo_nulls, values; + private: + Result> FromArrays(const Array& offsets, const Array& sizes, + const Array& values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + if constexpr (kTypeClassIsListView) { + return ArrayType::FromArrays(offsets, sizes, values, pool_, null_bitmap, + null_count); + } else { + return ArrayType::FromArrays(offsets, values, pool_, null_bitmap, null_count); + } + } + + void TestFromArraysWithNullBitmap() { + std::shared_ptr offsets_w_nulls, offsets_wo_nulls; + std::shared_ptr sizes_w_nulls, sizes_wo_nulls; + std::shared_ptr values; std::vector offsets = {0, 1, 1, 3, 4}; + std::vector sizes = {1, 0, 2, 1}; std::vector offsets_w_nulls_is_valid = {true, false, true, true, true}; + std::vector sizes_w_nulls_is_valid = {true, false, true, true}; ArrayFromVector(offsets_w_nulls_is_valid, offsets, &offsets_w_nulls); ArrayFromVector(offsets, &offsets_wo_nulls); + ArrayFromVector(sizes_w_nulls_is_valid, sizes, + &sizes_w_nulls); + ArrayFromVector(sizes, &sizes_wo_nulls); + auto type = std::make_shared(int32()); auto expected = std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], null, [0, null], [0]]")); @@ -214,29 +296,41 @@ class TestListArray : public 
::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, - ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list - ASSERT_OK_AND_ASSIGN(result, - ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap - ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap - ASSERT_RAISES(Invalid, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_RAISES(Invalid, FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); + + if constexpr (kTypeClassIsListView) { + // Sizes with nulls will match. 
+ ASSERT_OK_AND_ASSIGN(auto result, + FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Cannot specify both null sizes with null_bitmap + ASSERT_RAISES(Invalid, FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values, + expected->null_bitmap())); + } } - void TestFromArraysWithSlicedOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedOffsets() { std::vector offsets = {-1, -1, 0, 1, 2, 4}; std::shared_ptr offsets_wo_nulls; @@ -261,7 +355,8 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArraysWithSlicedNullOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedNullOffsets() { std::vector offsets = {-1, -1, 0, 1, 1, 3}; std::vector offsets_w_nulls_is_valid = {true, true, true, false, true, true}; @@ -288,7 +383,17 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArrays() { + public: + void TestFromArraysNullHandling() { + this->TestFromArraysWithNullBitmap(); + if constexpr (!kTypeClassIsListView) { + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); + } + } + + private: + void DoTestListFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; @@ -373,6 +478,87 @@ class TestListArray : public ::testing::Test { } } + template + std::enable_if_t DoTestListViewFromArrays() { + std::shared_ptr offsets1, offsets2; + std::shared_ptr sizes1, sizes2, sizes3, sizes4, sizes5; + std::shared_ptr values; + + std::vector sizes_is_valid3 = {true, false, true, true}; + std::vector sizes_is_valid4 = {true, true, false, true}; + std::vector sizes_is_valid5 = {true, true, false, false}; + + std::vector values_is_valid = {true, false, true, true, true, true}; + + std::vector offset1_values = {2, 0, 2}; + 
std::vector offset2_values = {2, 0, 6}; + std::vector size1_values = {0, 2, 4}; + std::vector size2_values = {4, 2, 0}; + + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); + + ArrayFromVector(size1_values, &sizes1); + ArrayFromVector(size2_values, &sizes2); + ArrayFromVector(sizes_is_valid3, size1_values, &sizes3); + ArrayFromVector(sizes_is_valid4, size2_values, &sizes4); + ArrayFromVector(sizes_is_valid5, size2_values, &sizes5); + + ArrayFromVector(values_is_valid, values_values, &values); + + auto list_type = std::make_shared(int8()); + + ASSERT_OK_AND_ASSIGN(auto list_view1, + ArrayType::FromArrays(*offsets1, *sizes1, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view3, + ArrayType::FromArrays(*offsets1, *sizes3, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view4, + ArrayType::FromArrays(*offsets2, *sizes4, *values, pool_)); + ASSERT_OK(list_view1->ValidateFull()); + ASSERT_OK(list_view3->ValidateFull()); + ASSERT_OK(list_view4->ValidateFull()); + + ArrayType expected1(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, offsets1->data()->buffers[0], + 0); + AssertArraysEqual(expected1, *list_view1); + + // Use null bitmap from sizes3, but clean sizes from non-null version + ArrayType expected3(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, sizes3->data()->buffers[0], + 1); + AssertArraysEqual(expected3, *list_view3); + + ArrayType expected4(list_type, length, offsets2->data()->buffers[1], + sizes2->data()->buffers[1], values, sizes4->data()->buffers[0], + 1); + AssertArraysEqual(expected4, *list_view4); + + // Test failure modes + + std::shared_ptr tmp; + + // Zero-length offsets (not a failure mode for ListViews) + ASSERT_OK(ArrayType::FromArrays(*offsets1->Slice(0, 0), *sizes1->Slice(0, 0), *values, + pool_)); + + // Offsets not the right type + 
ASSERT_RAISES(TypeError, + ArrayType::FromArrays(/*offsets=*/*values, *sizes1, *values, pool_)); + } + + public: + void TestFromArrays() { + if constexpr (kTypeClassIsListView) { + DoTestListViewFromArrays(); + } else { + DoTestListFromArrays(); + } + } + void TestAppendNull() { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -420,11 +606,13 @@ class TestListArray : public ::testing::Test { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector is_valid = {1, 0, 1}; std::vector offsets = {0, 3, 3}; + std::vector sizes = {3, 0, 1}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -434,16 +622,17 @@ class TestListArray : public ::testing::Test { void TestBulkAppendInvalid() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; std::vector is_valid = {1, 0, 1}; - // Should be {0, 3, 3} given the is_valid array std::vector offsets = {0, 2, 4}; + std::vector sizes = {2, 2, 4}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -466,7 +655,12 @@ class TestListArray : public ::testing::Test { builder_.reset(checked_cast(tmp.release())); std::vector offsets = {1, 2, 4, 8}; - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + 
std::vector sizes = {1, 2, 4}; + if constexpr (kTypeClassIsListView) { + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), sizes.size())); + } else { + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + } std::shared_ptr list_array; ASSERT_OK(builder_->Finish(&list_array)); @@ -485,10 +679,16 @@ class TestListArray : public ::testing::Test { void TestFlattenSimple() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( - ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); + ArrayFromJSON(type, "[[], null, [1, 2], [3], [4], null, [5], [], [6]]")); ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); + + list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [1, 2], [3], [4], [], [5], [], [6]]")); + ASSERT_OK_AND_ASSIGN(flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); } void TestFlattenNulls() { @@ -500,6 +700,35 @@ class TestListArray : public ::testing::Test { AssertTypeEqual(*flattened->type(), *value_type_); } + void TestFlattenAllEmpty() { + auto type = std::make_shared(int32()); + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [], [], [], [], []]")); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))); + + if constexpr (kTypeClassIsListView) { + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")); + auto array_data = list_array->data(); + + auto offsets = array_data->buffers[1]->template mutable_data_as(); + auto sizes = array_data->buffers[2]->template mutable_data_as(); + + // Set all sizes to 0, except the one for the null entry + 
memset(sizes, 0, sizeof(offset_type) * array_data->length); + sizes[2] = 4; + // Make the offset of the null entry be non-zero and out of order + offsets[2] = 1; + + ASSERT_OK(list_array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))) + << flattened->ToString(); + } + } + void TestFlattenSliced() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( @@ -520,7 +749,7 @@ class TestListArray : public ::testing::Test { std::dynamic_pointer_cast( ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")) ->data(); - ASSERT_EQ(2, array_data->buffers.size()); + ASSERT_EQ(kTypeClassIsListView ? 3 : 2, array_data->buffers.size()); auto null_bitmap_buffer = array_data->buffers[0]; ASSERT_NE(nullptr, null_bitmap_buffer); bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 1); @@ -534,20 +763,47 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } - Status ValidateOffsets(int64_t length, std::vector offsets, - const std::shared_ptr& values, int64_t offset = 0) { + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, + std::vector sizes, + std::shared_ptr values, int64_t offset = 0) { auto type = std::make_shared(values->type()); - ArrayType arr(type, length, Buffer::Wrap(offsets), values, + auto offsets_buffer = Buffer::Wrap(offsets.data(), sizes.size()); + auto sizes_buffer = Buffer::Wrap(sizes); + ArrayType arr(type, length, std::move(offsets_buffer), std::move(sizes_buffer), + std::move(values), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateFull(); } - void TestValidateOffsets() { + Status ValidateOffsets(int64_t length, std::vector offsets, + std::shared_ptr values, int64_t offset = 0) { + if constexpr (kTypeClassIsListView) { + std::vector sizes; + // Always reserve some space so Buffer::Wrap doesn't create a null buffer + // when length of the sizes buffer is 0. 
+ sizes.reserve( + std::max(static_cast(1), offsets.empty() ? 0 : offsets.size() - 1)); + for (size_t i = 1; i < offsets.size(); ++i) { + sizes.push_back(offsets[i] - offsets[i - 1]); + } + return ValidateOffsetsAndSizes(length, std::move(offsets), std::move(sizes), + std::move(values), offset); + } else { + auto type = std::make_shared(values->type()); + ArrayType arr(type, length, Buffer::Wrap(offsets), std::move(values), + /*null_bitmap=*/nullptr, /*null_count=*/0, offset); + return arr.ValidateFull(); + } + } + + void TestValidateDimensions() { auto empty_values = ArrayFromJSON(int16(), "[]"); auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, 5, 6, 7]"); - // An empty list array can have omitted or 0-length offsets - ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + if constexpr (!kTypeClassIsListView) { + // An empty list array can have omitted or 0-length offsets + ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + } ASSERT_OK(ValidateOffsets(0, {0}, empty_values)); ASSERT_OK(ValidateOffsets(1, {0, 7}, values)); @@ -564,13 +820,24 @@ class TestListArray : public ::testing::Test { // Offset out of bounds ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + if constexpr (kTypeClassIsListView) { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 2)); + } else { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + } // Negative offset ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, values)); ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, -1, -1}, values, 1)); // Offsets non-monotonic ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 7, 4}, values)); + + if constexpr (kTypeClassIsListView) { + // Out of order offsets + ASSERT_OK(ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 6, 5}, values)); + + // Sizes out of bounds + ASSERT_RAISES(Invalid, ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 7, 5}, values)); + 
} } void TestCornerCases() { @@ -581,7 +848,7 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result_, *expected); SetUp(); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 0)); Done(); expected = ArrayFromJSON(type_, "[[]]"); AssertArraysEqual(*result_, *expected); @@ -602,7 +869,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); @@ -612,7 +879,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(vb->Append(3)); @@ -629,7 +896,7 @@ class TestListArray : public ::testing::Test { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestListArray, ListTypes); +TYPED_TEST_SUITE(TestListArray, ListAndListViewTypes); TYPED_TEST(TestListArray, Basics) { this->TestBasics(); } @@ -639,11 +906,7 @@ TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } -TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { - this->TestFromArraysWithNullBitMap(); - this->TestFromArraysWithSlicedOffsets(); - this->TestFromArraysWithSlicedNullOffsets(); -} +TYPED_TEST(TestListArray, FromArraysNullHandling) { this->TestFromArraysNullHandling(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } @@ -661,12 +924,13 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } 
TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } +TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } -TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } +TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } @@ -676,6 +940,82 @@ TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck( TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } #endif +class TestListConversions : public ::testing::Test { + private: + MemoryPool* pool_; + + public: + TestListConversions() : pool_(default_memory_pool()) {} + + template + void DoTestListViewFromList() { + using DestListViewArrayClass = typename TypeTraits::ArrayType; + using SrcListArrayClass = typename TypeTraits::ArrayType; + auto list_type = std::make_shared(int32()); + auto list_view_type = std::make_shared(int32()); + + auto expected_list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_view_wo_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + std::shared_ptr list_w_nulls = + ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_wo_nulls, *result, 
/*verbose=*/true); + } + + template + void DoTestListFromListView() { + using SrcListViewArrayClass = typename TypeTraits::ArrayType; + using DestListArrayClass = typename TypeTraits::ArrayType; + auto list_view_type = std::make_shared(int32()); + auto list_type = std::make_shared(int32()); + + auto list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); + } +}; + +TEST_F(TestListConversions, ListViewFromList) { + this->DoTestListViewFromList(); + this->DoTestListViewFromList(); +} + +TEST_F(TestListConversions, ListFromListView) { + this->DoTestListFromListView(); + this->DoTestListFromListView(); +} + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc new file mode 100644 index 0000000000000..3e48191cedded --- /dev/null +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/util.h" +#include "arrow/pretty_print.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// List-view array tests + +namespace { + +class TestListViewArray : public ::testing::Test { + public: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + + void SetUp() override { + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + } + + static std::shared_ptr Offsets(std::string_view json) { + return ArrayFromJSON(int32(), json); + } + + static std::shared_ptr Sizes(std::string_view json) { + return ArrayFromJSON(int32(), json); + } +}; + +} // namespace + +TEST_F(TestListViewArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto list_view_array, + ListViewArray::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + auto array_data = list_view_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *list_view_array); + // Should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), 
array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); +} + +TEST_F(TestListViewArray, FromOffsetsAndSizes) { + std::shared_ptr list_view_array; + + ASSERT_OK_AND_ASSIGN(list_view_array, ListViewArray::FromArrays( + *Offsets("[0, 0, 1, 1000]"), + *Sizes("[2, 1, 1, null]"), *int32_values)); + ASSERT_EQ(list_view_array->length(), 4); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *int32_values); + ASSERT_EQ(list_view_array->offset(), 0); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 1); + ASSERT_EQ(list_view_array->data()->buffers.size(), 3); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index d8308c824953a..03f3e5af29908 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -27,6 +27,8 @@ #include "arrow/array/array_base.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -38,6 +40,7 @@ #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -48,7 +51,7 @@ using internal::checked_pointer_cast; using internal::CopyBitmap; // ---------------------------------------------------------------------- -// ListArray / LargeListArray (common utilities) +// ListArray / LargeListArray / ListViewArray / LargeListViewArray (common utilities) namespace { @@ -137,6 +140,77 @@ Result::ArrayType>> ListArrayFromArray return std::make_shared(std::move(data)); } +template +Result::ArrayType>> ListViewArrayFromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + using offset_type = typename 
TYPE::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + if (offsets.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); + } + + if (sizes.length() != offsets.length() && sizes.length() != offsets.length() - 1) { + return Status::Invalid( + "List sizes must have the same length as offsets or one less than offsets"); + } + if (sizes.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List sizes must be ", OffsetArrowType::type_name()); + } + + if (offsets.offset() != sizes.offset()) { + return Status::Invalid("List offsets and sizes must have the same offset"); + } + const int64_t array_offset = sizes.offset(); + + if (null_bitmap) { + if (offsets.null_count() > 0 || sizes.null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets or sizes with nulls"); + } + if (array_offset != 0) { + return Status::Invalid( + "List offsets and sizes must not be slices if a validity map is specified"); + } + } else { + if (offsets.null_count() > 0 && sizes.null_count() > 0) { + return Status::Invalid("Ambiguous to specify both offsets and sizes with nulls"); + } + } + + DCHECK(offsets.length() == sizes.length() || offsets.length() - 1 == sizes.length()); + + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + const auto& typed_sizes = checked_cast(sizes); + + auto derived_validity_buffer = std::move(null_bitmap); + if (offsets.null_count() > 0) { + derived_validity_buffer = offsets.null_bitmap(); + null_count = offsets.null_count(); + // We allow construction from an offsets array containing one extra value. + // If that is the case, we might need to discount one null from out_null_count. 
+ if (offsets.length() - 1 == sizes.length() && !offsets.IsValid(sizes.length())) { + null_count -= 1; + } + } else if (sizes.null_count() > 0) { + derived_validity_buffer = sizes.null_bitmap(); + null_count = sizes.null_count(); + } + + auto buffers = BufferVector({ + std::move(derived_validity_buffer), + typed_offsets.values(), + typed_sizes.values(), + }); + auto data = ArrayData::Make(type, sizes.length(), std::move(buffers), {values.data()}, + null_count, array_offset); + return std::make_shared(std::move(data)); +} + static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, int64_t end) { return array.Slice(begin, end - begin); @@ -189,23 +263,199 @@ Result> FlattenListArray(const ListArrayT& list_array, return Concatenate(non_null_fragments, memory_pool); } +template +Result> FlattenListViewArray(const ListViewArrayT& list_view_array, + MemoryPool* memory_pool) { + using offset_type = typename ListViewArrayT::offset_type; + const int64_t list_view_array_offset = list_view_array.offset(); + const int64_t list_view_array_length = list_view_array.length(); + std::shared_ptr value_array = list_view_array.values(); + + if (list_view_array_length == 0) { + return SliceArrayWithOffsets(*value_array, 0, 0); + } + + // If the list array is *all* nulls, then just return an empty array. + if constexpr (HasNulls) { + if (list_view_array.null_count() == list_view_array.length()) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + } + + const auto* validity = list_view_array.data()->template GetValues(0, 0); + const auto* offsets = list_view_array.data()->template GetValues(1); + const auto* sizes = list_view_array.data()->template GetValues(2); + + auto is_null_or_empty = [&](int64_t i) { + if constexpr (HasNulls) { + if (!bit_util::GetBit(validity, list_view_array_offset + i)) { + return true; + } + } + return sizes[i] == 0; + }; + + // Index of the first valid, non-empty list-view. 
+ int64_t first_i = 0; + for (; first_i < list_view_array_length; first_i++) { + if (!is_null_or_empty(first_i)) { + break; + } + } + // If all list-views are empty, return an empty array. + if (first_i == list_view_array_length) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + + std::vector> slices; + { + int64_t i = first_i; + auto begin_offset = offsets[i]; + auto end_offset = offsets[i] + sizes[i]; + i += 1; + // Inductive invariant: slices and the always non-empty values slice + // [begin_offset, end_offset) contains all the maximally contiguous slices of the + // values array that are covered by all the list-views before list-view i. + for (; i < list_view_array_length; i++) { + if (is_null_or_empty(i)) { + // The invariant is preserved by simply preserving the current set of slices. + } else { + if (offsets[i] == end_offset) { + end_offset += sizes[i]; + // The invariant is preserved because since the non-empty list-view i + // starts at end_offset, the current range can be extended to end at + // offsets[i] + sizes[i] (the same as end_offset + sizes[i]). + } else { + // The current slice can't be extended because the list-view i either + // shares values with the current slice or starts after the position + // immediately after the end of the current slice. + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + begin_offset = offsets[i]; + end_offset = offsets[i] + sizes[i]; + // The invariant is preserved because a maximally contiguous slice of + // the values array (i.e. one that can't be extended) was added to slices + // and [begin_offset, end_offset) is non-empty and contains the + // current list-view i. + } + } + } + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + } + + // Final attempt to avoid invoking Concatenate(). 
+ switch (slices.size()) { + case 0: + return MakeEmptyArray(value_array->type(), memory_pool); + case 1: + return slices[0]; + } + + return Concatenate(slices, memory_pool); +} + std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, const ArrayData& data) { + const int64_t num_offsets = + is_list_view(data.type->id()) ? data.length : data.length + 1; std::vector> buffers = {nullptr, data.buffers[1]}; auto offsets_data = - std::make_shared(boxed_type, data.length + 1, std::move(buffers), + std::make_shared(boxed_type, /*length=*/num_offsets, std::move(buffers), /*null_count=*/0, data.offset); return MakeArray(offsets_data); } +std::shared_ptr BoxSizes(const std::shared_ptr& boxed_type, + const ArrayData& data) { + DCHECK(is_list_view(data.type->id())); + std::vector> buffers = {nullptr, data.buffers[2]}; + auto sizes_data = + std::make_shared(boxed_type, data.length, std::move(buffers), + /*null_count=*/0, data.offset); + return MakeArray(sizes_data); +} + +template +Result> ListViewFromListImpl( + const std::shared_ptr& list_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename SrcListType::offset_type; + const auto& list_type = checked_cast(*list_data->type); + + // To re-use the validity and offsets buffers, a sizes buffer with enough + // padding on the beginning is allocated and filled with the sizes after + // list_data->offset. + const int64_t buffer_length = list_data->offset + list_data->length; + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + AllocateBuffer(buffer_length * sizeof(offset_type), pool)); + const auto* offsets = list_data->template GetValues(1, 0); + auto* sizes = sizes_buffer->mutable_data_as(); + // Zero the initial padding area to avoid leaking any data when buffers are + // sent over IPC or through the C Data interface.
+ memset(sizes, 0, list_data->offset * sizeof(offset_type)); + for (int64_t i = list_data->offset; i < buffer_length; i++) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], + std::move(sizes_buffer)}; + + return ArrayData::Make(std::make_shared(list_type.value_type()), + list_data->length, std::move(buffers), + {list_data->child_data[0]}, list_data->null_count, + list_data->offset); +} + +template +Result> ListFromListViewImpl( + const std::shared_ptr& list_view_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename DestListType::offset_type; + using ListBuilderType = typename TypeTraits::BuilderType; + + const auto& list_view_type = + checked_cast(*list_view_data->type); + const auto& value_type = list_view_type.value_type(); + const auto list_type = std::make_shared(value_type); + + ARROW_ASSIGN_OR_RAISE(auto sum_of_list_view_sizes, + list_util::internal::SumOfLogicalListSizes(*list_view_data)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, + MakeBuilder(value_type, pool)); + RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); + auto list_builder = std::make_shared(pool, value_builder, list_type); + RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); + + ArraySpan values{*list_view_data->child_data[0]}; + const auto* in_validity_bitmap = list_view_data->GetValues(0); + const auto* in_offsets = list_view_data->GetValues(1); + const auto* in_sizes = list_view_data->GetValues(2); + for (int64_t i = 0; i < list_view_data->length; ++i) { + const bool is_valid = + !in_validity_bitmap || + bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); + const int64_t size = is_valid ? 
in_sizes[i] : 0; + RETURN_NOT_OK(list_builder->Append(is_valid, size)); + RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); + } + std::shared_ptr list_array_data; + RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); + return list_array_data; +} + } // namespace namespace internal { template -inline void SetListData(BaseListArray* self, const std::shared_ptr& data, +inline void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id) { - ARROW_CHECK_EQ(data->buffers.size(), 2); + ARROW_CHECK_EQ(data->buffers.size(), is_list_view(TYPE::type_id) ? 3 : 2); ARROW_CHECK_EQ(data->type->id(), expected_type_id); ARROW_CHECK_EQ(data->child_data.size(), 1); @@ -214,6 +464,7 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrlist_type_ = checked_cast(data->type.get()); self->raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); + // BaseListViewArray::SetData takes care of setting raw_value_sizes_. 
ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); @@ -225,7 +476,9 @@ inline void SetListData(BaseListArray* self, const std::shared_ptr data) { SetData(std::move(data)); } +ListArray::ListArray(std::shared_ptr data) { + ListArray::SetData(std::move(data)); +} ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -250,6 +503,13 @@ Result> ListArray::FromArrays( values, pool, null_bitmap, null_count); } +Result> ListArray::FromListView(const ListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> ListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -273,7 +533,9 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { SetData(data); } +LargeListArray::LargeListArray(const std::shared_ptr& data) { + LargeListArray::SetData(data); +} LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +546,7 @@ LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t le auto internal_data = ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); + LargeListArray::SetData(internal_data); } void LargeListArray::SetData(const std::shared_ptr& data) { @@ -299,6 +561,14 @@ Result> LargeListArray::FromArrays( null_count); } +Result> LargeListArray::FromListView( + const LargeListViewArray& source, MemoryPool* pool) { + 
ARROW_ASSIGN_OR_RAISE( + auto data, + (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> LargeListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -321,6 +591,144 @@ std::shared_ptr LargeListArray::offsets() const { return BoxOffsets(int64(), *data_); } +// ---------------------------------------------------------------------- +// ListViewArray + +ListViewArray::ListViewArray(std::shared_ptr data) { + ListViewArray::SetData(std::move(data)); +} + +ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + ListViewArray::SetData(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void ListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> ListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> ListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LIST_VIEW) { + return Status::TypeError("Expected list-view type, got ", type->ToString()); + } + const auto& list_view_type = checked_cast(*type); + if (!list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching list-view value 
type"); + } + return ListViewArrayFromArrays(std::move(type), offsets, sizes, values, + pool, null_bitmap, null_count); +} + +Result> ListViewArray::FromList(const ListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> LargeListViewArray::FromList( + const LargeListArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr ListViewArray::offsets() const { + return BoxOffsets(int32(), *data_); +} + +std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), *data_); } + +// ---------------------------------------------------------------------- +// LargeListViewArray + +LargeListViewArray::LargeListViewArray(std::shared_ptr data) { + LargeListViewArray::SetData(std::move(data)); +} + +LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, + int64_t null_count, int64_t offset) { + LargeListViewArray::SetData(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void LargeListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> LargeListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + 
std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> LargeListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LARGE_LIST_VIEW) { + return Status::TypeError("Expected large list-view type, got ", type->ToString()); + } + const auto& large_list_view_type = checked_cast(*type); + if (!large_list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching large list-view value type"); + } + return ListViewArrayFromArrays( + std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); +} + +Result> LargeListViewArray::Flatten( + MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr LargeListViewArray::offsets() const { + return BoxOffsets(int64(), *data_); +} + +std::shared_ptr LargeListViewArray::sizes() const { + return BoxSizes(int64(), *data_); +} + // ---------------------------------------------------------------------- // MapArray diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 8d5cc95fec00d..61606e1592d61 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and -// Union +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union #pragma once @@ -43,30 +43,31 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// ListArray +// VarLengthListLikeArray template -class BaseListArray; +class VarLengthListLikeArray; namespace internal { -// Private helper for ListArray::SetData. -// Unfortunately, trying to define BaseListArray::SetData outside of this header +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header // doesn't play well with MSVC. template -void SetListData(BaseListArray* self, const std::shared_ptr& data, +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); } // namespace internal -/// Base class for variable-sized list arrays, regardless of offset size. +/// Base class for variable-sized list and list-view arrays, regardless of offset size. 
template -class BaseListArray : public Array { +class VarLengthListLikeArray : public Array { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; - const TypeClass* list_type() const { return list_type_; } + const TypeClass* var_length_list_like_type() const { return this->list_type_; } /// \brief Return array object containing the list's values /// @@ -84,19 +85,26 @@ class BaseListArray : public Array { } // The following functions will not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - offset_type value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: - friend void internal::SetListData(BaseListArray* self, + friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -105,6 +113,29 @@ class BaseListArray : public Array { const offset_type* raw_value_offsets_ = NULLPTR; }; +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. 
+ /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + i += this->data_->offset; + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: @@ -120,10 +151,13 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. /// - /// Offsets of an Array's null bitmap can be present or an explicit - /// null_bitmap, but not both. + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type @@ -143,6 +177,10 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -181,7 +219,13 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. 
+ /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int64 type @@ -201,6 +245,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -216,6 +264,211 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { void SetData(const std::shared_ptr& data); }; +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template +class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// \brief Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// \brief Return pointer to raw value sizes accounting for any slice offset + const offset_type* raw_value_sizes() const { + return raw_value_sizes_ + this->data_->offset; + } + + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null. + /// The returned size in those cases could be any value from 0 to the + /// length of the child values array.
+ /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + return this->raw_value_sizes_[i + this->data_->offset]; + } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls.
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int64 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls.
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + // ---------------------------------------------------------------------- // MapArray @@ -319,10 +572,18 @@ class ARROW_EXPORT FixedSizeListArray : public Array { i += data_->offset; return list_size_ * i; } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. + /// + /// \pre IsValid(i) int32_t value_length(int64_t i = 0) const { ARROW_UNUSED(i); return list_size_; } + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 46908439ef5f0..be54d62fd77a7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -398,6 +398,8 @@ static std::vector> TestArrayUtilitiesAgainstTheseType large_list(list(large_utf8())), fixed_size_list(utf8(), 3), fixed_size_list(int64(), 4), + list_view(utf8()), + large_list_view(utf8()), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), sparse_union(union_fields1, union_type_codes), @@ -616,6 +618,8 @@ static ScalarVector GetScalars() { ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared( ScalarVector{ std::make_shared(2), @@ -752,9 +756,9 @@ TEST_F(TestArray, TestFillFromScalar) { ArraySpan span(*scalar); auto roundtripped_array = span.ToArray(); - AssertArraysEqual(*array, 
*roundtripped_array); - ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); AssertScalarsEqual(*scalar, *roundtripped_scalar); } @@ -3526,6 +3530,8 @@ DataTypeVector SwappableTypes() { large_utf8(), list(int16()), large_list(int16()), + list_view(int16()), + large_list_view(int16()), dictionary(int16(), utf8())}; } diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index d3502a0ab645a..40e705aa3e440 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -150,7 +150,8 @@ struct AppendScalarImpl { } template - enable_if_list_like Visit(const T&) { + enable_if_t::value || is_list_like_type::value, Status> Visit( + const T&) { auto builder = checked_cast::BuilderType*>(builder_); int64_t num_children = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { @@ -162,8 +163,12 @@ struct AppendScalarImpl { for (int64_t i = 0; i < n_repeats_; i++) { for (auto it = scalars_begin_; it != scalars_end_; ++it) { if (it->is_valid) { - RETURN_NOT_OK(builder->Append()); const Array& list = *checked_cast(*it).value; + if constexpr (T::type_id == Type::MAP || T::type_id == Type::FIXED_SIZE_LIST) { + RETURN_NOT_OK(builder->Append()); + } else { + RETURN_NOT_OK(builder->Append(/*is_valid=*/true, list.length())); + } for (int64_t i = 0; i < list.length(); i++) { ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index fbba1fd056430..5bdc76d96c8f0 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -30,6 +30,20 @@ namespace arrow { +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder / BaseListBuilder / BaseListViewBuilder + 
+template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; + +template class BaseListBuilder; +template class BaseListBuilder; + +template class BaseListViewBuilder; +template class BaseListViewBuilder; + // ---------------------------------------------------------------------- // MapBuilder diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d0b17c230489b..21c2d4b270eb1 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -40,37 +40,46 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// List builder +// VarLengthListLikeBuilder template -class BaseListBuilder : public ArrayBuilder { +class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. 
- BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type, - int64_t alignment = kDefaultBufferAlignment) + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - int64_t alignment = kDefaultBufferAlignment) - : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), + alignment) {} + + ~VarLengthListLikeBuilder() override = default; Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { - return Status::CapacityError("List array cannot reserve space for more than ", + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", maximum_elements(), " got ", capacity); } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); - // One more than requested for offsets - ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? 
capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); return ArrayBuilder::Resize(capacity); } @@ -80,56 +89,98 @@ class BaseListBuilder : public ArrayBuilder { value_builder_->Reset(); } - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const offset_type* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); - } - /// \brief Start a new variable-length list slot /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true) { + /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. + /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. 
+ /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. + /// + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list + Status Append(bool is_valid, int64_t list_length) { ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); return Status::OK(); } - Status AppendNull() final { return Append(false); } + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. 
+ return Append(false, 0); + } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } - Status AppendEmptyValue() final { return Append(true); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure list slot remains empty + Status AppendEmptyValue() final { return Append(true, 0); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, true); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets. 
+ /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? 
array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); @@ -137,43 +188,28 @@ class BaseListBuilder : public ArrayBuilder { const bool is_valid = all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || array.IsValid(row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); if (is_valid) { - int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], - offsets[row], slot_length)); + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } } return Status::OK(); } - Status FinishInternal(std::shared_ptr* out) override { - ARROW_RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets, null_bitmap; - ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); - } - - std::shared_ptr items; - ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - - *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, - null_count_); - Reset(); - return Status::OK(); - } - Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { - return Status::CapacityError("List array cannot contain more than ", + return Status::CapacityError(type_name(), " array cannot contain more than ", maximum_elements(), " elements, have ", new_elements); } else { return Status::OK(); @@ -191,20 +227,136 @@ 
class BaseListBuilder : public ArrayBuilder { return std::make_shared(value_field_->WithType(value_builder_->type())); } + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. + virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; std::shared_ptr value_field_; +}; + +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + +template +class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + using BASE::Append; + + ~BaseListBuilder() override = default; + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. 
+ return BASE::Append(is_valid, 0); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. 
+#ifndef NDEBUG + if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } +#endif + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); - const int64_t num_values = value_builder_->length(); - return offsets_builder_.Append(static_cast(num_values)); + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); } - void UnsafeAppendNextOffset() { - const int64_t num_values = value_builder_->length(); - offsets_builder_.UnsafeAppend(static_cast(num_values)); + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); } }; @@ -247,6 +399,116 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; 
+// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + ~BaseListViewBuilder() override = default; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = 
ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + // ---------------------------------------------------------------------- // Map builder diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 37c7271b5b95c..ff9ed66d1149f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -35,14 +35,17 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/int_util_overflow.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include 
"arrow/util/ree_util.h" +#include "arrow/util/slice_util_internal.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -98,10 +101,18 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, return Status::OK(); } +int64_t SumBufferSizesInBytes(const BufferVector& buffers) { + int64_t size = 0; + for (const auto& buffer : buffers) { + size += buffer->size(); + } + return size; +} + // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, @@ -113,33 +124,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, values_ranges->resize(buffers.size()); // allocate output buffer - int64_t out_length = 0; - for (const auto& buffer : buffers) { - out_length += buffer->size() / sizeof(Offset); - } - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer((out_length + 1) * sizeof(Offset), pool)); - auto dst = reinterpret_cast((*out)->mutable_data()); + const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size_in_bytes, pool)); + auto* out_data = (*out)->mutable_data_as(); int64_t elements_length = 0; Offset values_length = 0; for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(buffers[i], values_length, &dst[elements_length], - &(*values_ranges)[i])); + RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, + out_data + elements_length, &(*values_ranges)[i])); elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } - // 
the final element in dst is the length of all values spanned by the offsets - dst[out_length] = values_length; + // the final element in out_data is the length of all values spanned by the offsets + out_data[out_size_in_bytes / sizeof(Offset)] = values_length; return Status::OK(); } template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range) { - if (src->size() == 0) { + if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; @@ -148,8 +156,8 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse } // Get the range of offsets to transfer from src - auto src_begin = reinterpret_cast(src->data()); - auto src_end = reinterpret_cast(src->data() + src->size()); + auto src_begin = src.data_as(); + auto src_end = reinterpret_cast(src.data() + src.size()); // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; @@ -160,16 +168,132 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. 
// (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. 
+// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer +template +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, + const std::vector& value_ranges, + MemoryPool* pool, std::shared_ptr* out) { + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); + memset((*out)->mutable_data(), 0, static_cast((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < offset_buffers.size(); ++i) { + const auto displacement = + static_cast(num_child_values - value_ranges[i].offset); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, + static_cast(out_size_in_bytes / sizeof(offset_type))); + + return Status::OK(); +} + +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return Status::OK(); + } + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. 
+ RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as(); + DCHECK_EQ(static_cast(src.size() / sizeof(offset_type)), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer ? validity_buffer->data_as() : nullptr; + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + visit_not_null(position); + } + } else if (block.NoneSet()) { + // NOTE: we don't have to do anything for the null entries regarding the + // offsets as the buffer is initialized to 0 when it is allocated. + + // Zero-out the sizes of the null entries to ensure these sizes are not + // greater than the new values length of the concatenated array. + memset(sizes + position, 0, block.length * sizeof(offset_type)); + position += block.length; + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(validity, input.offset + position)) { + visit_not_null(position); + } else { + // Zero-out the size at position. 
+ sizes[position] = 0; + } + } + } + } + return Status::OK(); +} + class ConcatenateImpl { public: ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool) @@ -288,6 +412,41 @@ class ConcatenateImpl { return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); } + template + enable_if_list_view Visit(const T& type) { + using offset_type = typename T::offset_type; + out_->buffers.resize(3); + out_->child_data.resize(1); + + // Calculate the ranges of values that each list-view array uses + std::vector value_ranges; + value_ranges.reserve(in_.size()); + for (const auto& input : in_) { + ArraySpan input_span(*input); + Range range; + ARROW_ASSIGN_OR_RAISE(std::tie(range.offset, range.length), + list_util::internal::RangeOfValuesUsed(input_span)); + value_ranges.push_back(range); + } + + // Concatenate the values + ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); + RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + out_->child_data[0]->type = type.value_type(); + + // Concatenate the sizes first + ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2])); + + // Concatenate the offsets + ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, + value_ranges, pool_, &out_->buffers[1])); + + return Status::OK(); + } + Status Visit(const FixedSizeListType& fixed_size_list) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0ef1136ea78f8..af595e897f9ee 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -40,26 +41,55 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/list_util.h" namespace arrow { -class ConcatenateTest : public ::testing::Test { - protected: - ConcatenateTest() - : rng_(seed_), - sizes_({0, 1, 2, 4, 16, 31, 1234}), - null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} +class SimpleRandomArrayGenerator { + private: + random::SeedType seed_ = 0xdeadbeef; + std::default_random_engine random_engine_; + random::RandomArrayGenerator rag_; + + public: + SimpleRandomArrayGenerator() : random_engine_(seed_), rag_(seed_) {} + + template + std::vector RandomOffsetsInRange(offset_type min_offset, + offset_type max_offset, + int64_t num_offsets) { + std::vector offsets(static_cast(num_offsets)); + std::uniform_int_distribution dist(min_offset, max_offset); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(random_engine_); }); + return offsets; + } - template - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); - std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); - std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + template + std::vector Offsets(int32_t values_length, int32_t slice_count) { + auto offsets = RandomOffsetsInRange(0, values_length, slice_count + 1); std::sort(offsets.begin(), offsets.end()); return offsets; } + /// \param[in] random_offsets Random offsets in [0, values_size] and no particular order + template + std::vector ListViewSizes(const std::vector& random_offsets, + int64_t values_size, double avg_size, + int64_t num_sizes) { + std::normal_distribution normal(/*mean=*/avg_size, /*stddev=*/3.0); + std::vector sizes; + sizes.reserve(num_sizes); + for (int64_t i = 0; i < num_sizes; ++i) { + const auto sampled_size = std::llround(normal(random_engine_)); + auto size = 
std::max(0, static_cast(sampled_size)); + if (random_offsets[i] > values_size - size) { + size = static_cast(values_size - random_offsets[i]); + } + sizes.push_back(size); + } + return sizes; + } + ArrayVector Slices(const std::shared_ptr& array, const std::vector& offsets) { ArrayVector slices(offsets.size() - 1); @@ -69,33 +99,119 @@ class ConcatenateTest : public ::testing::Test { return slices; } + std::shared_ptr ValidityBitmap(int64_t size, double null_probability) { + return rag_.NullBitmap(size, null_probability, kDefaultBufferAlignment, + default_memory_pool()); + } + template - std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + std::shared_ptr PrimitiveArray(int64_t size, double null_probability) { if (std::is_same::value) { - return rng_.Boolean(size, 0.5, null_probability); + return rag_.Boolean(size, 0.5, null_probability); } - return rng_.Numeric(size, 0, 127, null_probability); + return rag_.Numeric(size, 0, 127, null_probability); + } + + std::shared_ptr StringArray(int64_t size, double null_probability) { + return rag_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + } + + std::shared_ptr LargeStringArray(int64_t size, double null_probability) { + return rag_.LargeString(size, /*min_length =*/0, /*max_length =*/15, + null_probability); + } + + std::shared_ptr StringViewArray(int64_t size, double null_probability) { + return rag_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + } + + std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, + double null_probability) { + return rag_.ArrayOf(std::move(type), size, null_probability); + } + + // TODO(GH-38656): Use the random array generators from testing/random.h here + + template ::ArrayType> + Result> ListArray(int32_t length, + double null_probability) { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + auto values_size = 
length * 4; + auto values = PrimitiveArray(values_size, null_probability); + auto offsets_vector = Offsets(values_size, length); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + return ListArrayType::FromArrays(*offsets, *values); } + template ::ArrayType> + Result> ListViewArray(int32_t length, + double null_probability) { + using offset_type = typename ListViewType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + constexpr int kAvgListViewSize = 4; + auto values_size = kAvgListViewSize * length; + + auto values = PrimitiveArray(values_size, null_probability); + + std::shared_ptr offsets; + auto offsets_vector = RandomOffsetsInRange(0, values_size, length); + ArrayFromVector(offsets_vector, &offsets); + + std::shared_ptr sizes; + auto sizes_vector = + ListViewSizes(offsets_vector, values_size, kAvgListViewSize, length); + ArrayFromVector(sizes_vector, &sizes); + + auto validity_bitmap = ValidityBitmap(length, null_probability); + auto valid_count = internal::CountSetBits(validity_bitmap->data(), 0, length); + + return ListViewArrayType::FromArrays( + *offsets, *sizes, *values, default_memory_pool(), + valid_count == length ? 
nullptr : std::move(validity_bitmap)); + } +}; + +class ConcatenateTest : public ::testing::Test { + private: + std::vector sizes_; + std::vector null_probabilities_; + + protected: + SimpleRandomArrayGenerator rag; + + ConcatenateTest() + : sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { if (auto preceding_bits = bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) - << length << " " << int(preceding_bits); + << length << " " << static_cast(preceding_bits); } } template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); + auto offsets = rag.Offsets(size, 3); for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); ASSERT_OK(expected->ValidateFull()); - auto slices = this->Slices(array, offsets); + auto slices = rag.Slices(array, offsets); for (auto slice : slices) { ASSERT_OK(slice->ValidateFull()); } @@ -111,11 +227,6 @@ class ConcatenateTest : public ::testing::Test { } } } - - random::SeedType seed_ = 0xdeadbeef; - random::RandomArrayGenerator rng_; - std::vector sizes_; - std::vector null_probabilities_; }; TEST(ConcatenateEmptyArraysTest, TestValueBuffersNullPtr) { @@ -144,7 +255,7 @@ TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->template GeneratePrimitive(size, null_probability); + *out = this->rag.template PrimitiveArray(size, null_probability); }); } @@ -156,23 +267,21 @@ TEST_F(ConcatenateTest, NullType) { TEST_F(ConcatenateTest, 
StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.StringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, - /*max_buffer_length=*/200); + *out = rag.StringViewArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = - rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.LargeStringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } @@ -181,7 +290,7 @@ TEST_F(ConcatenateTest, FixedSizeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto list_size = 3; auto values_size = size * list_size; - auto values = this->GeneratePrimitive(values_size, null_probability); + auto values = this->rag.PrimitiveArray(values_size, null_probability); ASSERT_OK_AND_ASSIGN(*out, FixedSizeListArray::FromArrays(values, list_size)); ASSERT_OK((**out).ValidateFull()); }); @@ -189,39 +298,40 @@ TEST_F(ConcatenateTest, FixedSizeListType) { TEST_F(ConcatenateTest, ListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, ListArray::FromArrays(*offsets, *values)); 
+ ASSERT_OK_AND_ASSIGN(*out, this->rag.ListArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, LargeListArray::FromArrays(*offsets, *values)); + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, ListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListViewArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, LargeListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN( + *out, this->rag.ListViewArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StructType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); + auto foo = this->rag.PrimitiveArray(size, null_probability); + auto bar = this->rag.PrimitiveArray(size, null_probability); + auto baz = this->rag.PrimitiveArray(size, null_probability); *out = std::make_shared( struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), size, ArrayVector{foo, bar, baz}); @@ -230,8 +340,8 @@ TEST_F(ConcatenateTest, 
StructType) { TEST_F(ConcatenateTest, DictionaryType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto indices = this->GeneratePrimitive(size, null_probability); - auto dict = this->GeneratePrimitive(128, 0); + auto indices = rag.PrimitiveArray(size, null_probability); + auto dict = rag.PrimitiveArray(128, 0); auto type = dictionary(int32(), dict->type()); *out = std::make_shared(type, indices, dict); }); @@ -382,20 +492,20 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(sparse_union({ - field("a", float64()), - field("b", boolean()), - }), - size, null_probability); + *out = rag.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(dense_union({ - field("a", uint32()), - field("b", boolean()), - field("c", int8()), - }), - size, null_probability); + *out = rag.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probability); }); } @@ -413,7 +523,7 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); auto offsets_ok = ArrayFromJSON(int32(), "[0]"); auto child_array_overflow = - this->rng_.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); + rag.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); ASSERT_OK_AND_ASSIGN( auto array_overflow, DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); @@ -546,7 +656,7 @@ TEST_F(ConcatenateTest, DenseUnionType) { TEST_F(ConcatenateTest, ExtensionType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto storage = this->GeneratePrimitive(size, null_probability); + auto storage = this->rag.PrimitiveArray(size, 
null_probability); *out = ExtensionType::WrapArray(smallint(), storage); }); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 678513fd4d151..3ea5ca88523c3 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -244,9 +244,22 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { auto* offsets = reinterpret_cast(scratch_space); offsets[0] = 0; offsets[1] = static_cast(value_size); + static_assert(2 * sizeof(offset_type) <= 16); return {scratch_space, sizeof(offset_type) * 2}; } +template +std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, + offset_type value_size) { + auto* offsets = scratch_space; + auto* sizes = scratch_space + sizeof(offset_type); + reinterpret_cast(offsets)[0] = 0; + reinterpret_cast(sizes)[0] = value_size; + static_assert(2 * sizeof(offset_type) <= 16); + return {BufferSpan{offsets, sizeof(offset_type)}, + BufferSpan{sizes, sizeof(offset_type)}}; +} + int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -261,6 +274,8 @@ int GetNumBuffers(const DataType& type) { case Type::STRING_VIEW: case Type::BINARY_VIEW: case Type::DENSE_UNION: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: return 3; case Type::EXTENSION: // The number of buffers depends on the storage type @@ -381,7 +396,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); this->buffers[1].size = scalar.value->size(); - } else if (is_list_like(type_id)) { + } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); int64_t value_length = 0; @@ -402,7 +417,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); } else if (type_id == Type::LARGE_LIST) { this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + } 
else if (type_id == Type::LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + scalar.scratch_space_, static_cast(value_length)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = + OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); } else { + DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer this->buffers[1] = {}; } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index be9597e59b378..f9714eda34c61 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -289,6 +289,13 @@ class ValueComparatorFactory { Status Visit(const NullType&, const Array&, const Array&) { return Status::NotImplemented("null type"); } + Status Visit(const ListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } + + Status Visit(const LargeListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } Status Visit(const ExtensionType&, const Array&, const Array&) { return Status::NotImplemented("extension type"); @@ -589,6 +596,9 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::LIST_VIEW || + base.type()->id() == Type::LARGE_LIST_VIEW) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } @@ -732,6 +742,14 @@ class MakeFormatterImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + + Status Visit(const LargeListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + // TODO(bkietz) 
format maps better Status Visit(const StructType& t) { diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 9ea2fc2b6f0a1..86e2ffcae4de7 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -134,7 +134,6 @@ class ArrayDataEndianSwapper { out_->buffers[index] = data_->buffers[index]; return Status::OK(); } - // Except union, offset has one more element rather than data->length ARROW_ASSIGN_OR_RAISE(out_->buffers[index], ByteSwapBuffer(data_->buffers[index])); return Status::OK(); @@ -290,6 +289,17 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const ListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const LargeListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const DictionaryType& type) { // dictionary was already swapped in ReadDictionary() in ipc/reader.cc RETURN_NOT_OK(SwapType(*type.index_type())); @@ -379,7 +389,14 @@ class NullArrayFactory { enable_if_var_size_list Visit(const T& type) { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); - RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } @@ -518,8 +535,8 @@ class NullArrayFactory { } template - enable_if_var_size_list Visit(const T& type) { - out_->buffers.resize(2, buffer_); + enable_if_var_length_list_like Visit(const T& type) { + out_->buffers.resize(is_list_view(T::type_id) ? 
3 : 2, buffer_); ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, /*length=*/0)); return Status::OK(); } @@ -698,12 +715,28 @@ class RepeatedArrayFactory { std::shared_ptr offsets_buffer; auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared(scalar_.type, length_, offsets_buffer, value_array); return Status::OK(); } + template + enable_if_list_view Visit(const T& type) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; + + auto value = checked_cast(scalar_).value; + + auto size = static_cast(value->length()); + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, + CreateIntBuffer(0)); + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + CreateIntBuffer(size)); + out_ = std::make_shared(scalar_.type, length_, std::move(offsets_buffer), + std::move(sizes_buffer), value); + return Status::OK(); + } + Status Visit(const FixedSizeListType& type) { auto value = checked_cast(scalar_).value; @@ -853,6 +886,15 @@ class RepeatedArrayFactory { return builder.Finish(out); } + template + Result> CreateIntBuffer(IntType value) { + std::shared_ptr buffer; + TypedBufferBuilder builder(pool_); + RETURN_NOT_OK(builder.Append(/*num_copies=*/length_, value)); + RETURN_NOT_OK(builder.Finish(&buffer)); + return buffer; + } + Status CreateBufferOf(const void* data, size_t data_length, std::shared_ptr* out) { BufferBuilder builder(pool_); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3dde41b1450e8..8dd3eb3f90c15 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -23,7 +23,6 @@ #include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -269,6 +268,9 @@ struct ValidateArrayImpl { return 
MapArray::ValidateChildData(data.child_data); } + Status Visit(const ListViewType& type) { return ValidateListView(type); } + Status Visit(const LargeListViewType& type) { return ValidateListView(type); } + Status Visit(const FixedSizeListType& type) { const ArrayData& values = *data.child_data[0]; const int64_t list_size = type.list_size(); @@ -582,7 +584,7 @@ struct ValidateArrayImpl { const Buffer& values = *data.buffers[2]; // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.size())); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.size())); if (data.length > 0 && data.buffers[1]->is_cpu()) { using offset_type = typename BinaryType::offset_type; @@ -702,7 +704,7 @@ struct ValidateArrayImpl { } // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.offset + values.length)); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.offset + values.length)); // An empty list array can have 0 offsets if (data.length > 0 && data.buffers[1]->is_cpu()) { @@ -735,6 +737,18 @@ struct ValidateArrayImpl { return Status::OK(); } + template + Status ValidateListView(const ListViewType& type) { + const ArrayData& values = *data.child_data[0]; + const Status child_valid = RecurseInto(values); + if (!child_valid.ok()) { + return Status::Invalid("List-view child array is invalid: ", + child_valid.ToString()); + } + // For list-views, sizes are validated together with offsets. 
+ return ValidateOffsetsAndSizes(type, /*offset_limit=*/values.length); + } + template Status ValidateRunEndEncoded(const RunEndEncodedType& type) { if (data.child_data.size() != 2) { @@ -797,23 +811,105 @@ struct ValidateArrayImpl { return Status::OK(); } + private: + /// \pre basic validation has already been performed + template + Status FullyValidateOffsets(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + auto prev_offset = offsets[0]; + if (prev_offset < 0) { + return Status::Invalid("Offset invariant failure: array starts at negative offset ", + prev_offset); + } + for (int64_t i = 1; i <= data.length; ++i) { + const auto current_offset = offsets[i]; + if (current_offset < prev_offset) { + return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ", + i, ": ", current_offset, " < ", prev_offset); + } + if (current_offset > offset_limit) { + return Status::Invalid("Offset invariant failure: offset for slot ", i, + " out of bounds: ", current_offset, " > ", offset_limit); + } + prev_offset = current_offset; + } + return Status::OK(); + } + + template + Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: offset for slot ", slot, + " out of bounds. 
Expected ", offset, + " to be at least 0 and less than ", offset_limit); + } + + template + Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + const auto size = sizes[slot]; + if (size < 0) { + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", size, " < 0"); + } else { + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", offset, " + ", size, " > ", + offset_limit); + } + } + + /// \pre basic validation has already been performed + template + Status FullyValidateOffsetsAndSizes(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + + for (int64_t i = 0; i < data.length; ++i) { + const auto size = sizes[i]; + if (size >= 0) { + const auto offset = offsets[i]; + if (offset < 0 || offset > offset_limit) { + return OutOfBoundsListViewOffset(i, offset_limit); + } + if (size > offset_limit - offset) { + return OutOfBoundsListViewSize(i, offset_limit); + } + } else { + return OutOfBoundsListViewSize(i, offset_limit); + } + } + + return Status::OK(); + } + + public: template - Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { + Status ValidateOffsetsAndSizes(const TypeClass&, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; + constexpr bool is_list_view = is_list_view_type::value; - if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer seems accepted as a special case - // (ARROW-544) - if (data.length > 0) { - return Status::Invalid("Non-empty array but offsets are null"); + const bool non_empty = data.length > 0; + if constexpr (is_list_view) { + if (!IsBufferValid(1)) { + return Status::Invalid("offsets buffer is null"); + } + if (!IsBufferValid(2)) { + return Status::Invalid("sizes buffer is null"); + } + } else { + if 
(!IsBufferValid(1)) { + // For length 0, an empty offsets buffer is accepted (ARROW-544). + return non_empty ? Status::Invalid("Non-empty array but offsets are null") + : Status::OK(); } - return Status::OK(); } - // An empty list array can have 0 offsets const auto offsets_byte_size = data.buffers[1]->size(); const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0)) - ? data.length + data.offset + 1 + ? data.length + data.offset + (is_list_view ? 0 : 1) : 0; if (offsets_byte_size / static_cast(sizeof(offset_type)) < required_offsets) { @@ -821,28 +917,21 @@ struct ValidateArrayImpl { " isn't large enough for length: ", data.length, " and offset: ", data.offset); } + if constexpr (is_list_view) { + const auto required_sizes = data.length + data.offset; + const auto sizes_bytes_size = data.buffers[2]->size(); + if (sizes_bytes_size / static_cast(sizeof(offset_type)) < required_sizes) { + return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + } if (full_validation && required_offsets > 0) { - // Validate all offset values - const offset_type* offsets = data.GetValues(1); - - auto prev_offset = offsets[0]; - if (prev_offset < 0) { - return Status::Invalid( - "Offset invariant failure: array starts at negative offset ", prev_offset); - } - for (int64_t i = 1; i <= data.length; ++i) { - const auto current_offset = offsets[i]; - if (current_offset < prev_offset) { - return Status::Invalid( - "Offset invariant failure: non-monotonic offset at slot ", i, ": ", - current_offset, " < ", prev_offset); - } - if (current_offset > offset_limit) { - return Status::Invalid("Offset invariant failure: offset for slot ", i, - " out of bounds: ", current_offset, " > ", offset_limit); - } - prev_offset = current_offset; + if constexpr (is_list_view) { + return FullyValidateOffsetsAndSizes(offset_limit); + } else { + return FullyValidateOffsets(offset_limit); } } 
return Status::OK(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c7e6207bfefa4..7042d9818c691 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -221,6 +221,20 @@ struct MakeBuilderImpl { return Status::OK(); } + Status Visit(const ListViewType& list_view_type) { + std::shared_ptr value_type = list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new ListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + + Status Visit(const LargeListViewType& large_list_view_type) { + std::shared_ptr value_type = large_list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new LargeListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + Status Visit(const MapType& map_type) { ARROW_ASSIGN_OR_RAISE(auto key_builder, ChildBuilder(map_type.key_type())); ARROW_ASSIGN_OR_RAISE(auto item_builder, ChildBuilder(map_type.item_type())); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 033371d3d6719..eeec75f2f473d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -444,6 +444,10 @@ struct SchemaExporter { Status Visit(const LargeListType& type) { return SetFormat("+L"); } + Status Visit(const ListViewType& type) { return SetFormat("+vl"); } + + Status Visit(const LargeListViewType& type) { return SetFormat("+vL"); } + Status Visit(const FixedSizeListType& type) { return SetFormat("+w:" + ToChars(type.list_size())); } @@ -1100,6 +1104,16 @@ struct SchemaImporter { return ProcessListLike(); case 'L': return ProcessListLike(); + case 'v': { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListView(); + case 'L': + return ProcessListView(); + } + break; + } case 'w': return ProcessFixedSizeList(); case 's': @@ -1204,6 +1218,15 @@ struct SchemaImporter { 
return Status::OK(); } + template + Status ProcessListView() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = std::make_shared(std::move(field)); + return Status::OK(); + } + Status ProcessMap() { RETURN_NOT_OK(f_parser_.CheckAtEnd()); RETURN_NOT_OK(CheckNumChildren(1)); @@ -1572,6 +1595,10 @@ struct ArrayImporter { Status Visit(const LargeListType& type) { return ImportListLike(type); } + Status Visit(const ListViewType& type) { return ImportListView(type); } + + Status Visit(const LargeListViewType& type) { return ImportListView(type); } + Status Visit(const FixedSizeListType& type) { RETURN_NOT_OK(CheckNumChildren(1)); RETURN_NOT_OK(CheckNumBuffers(1)); @@ -1667,6 +1694,18 @@ struct ArrayImporter { return Status::OK(); } + template + Status ImportListView(const ListViewType& type) { + using offset_type = typename ListViewType::offset_type; + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK((ImportOffsetsBuffer(1))); + RETURN_NOT_OK(ImportSizesBuffer(2)); + return Status::OK(); + } + Status CheckNoChildren() { return CheckNumChildren(0); } Status CheckNumChildren(int64_t n_children) { @@ -1735,11 +1774,18 @@ struct ArrayImporter { return ImportBuffer(buffer_id, buffer_size); } - template + template Status ImportOffsetsBuffer(int32_t buffer_id) { // Compute visible size of buffer - int64_t buffer_size = - sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + + (with_extra_offset ? 
1 : 0)); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportSizesBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset); return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bd0e498a9f332..362df833781a1 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,6 +33,7 @@ #include "arrow/c/util_internal.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" +#include "arrow/testing/builder.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -397,6 +398,14 @@ TEST_F(TestSchemaExport, List) { TestNested(list(large_list(int32())), {"+l", "+L", "i"}, {"", "item", "item"}); } +TEST_F(TestSchemaExport, ListView) { + TestNested(list_view(int8()), {"+vl", "c"}, {"", "item"}); + TestNested(large_list_view(uint16()), {"+vL", "S"}, {"", "item"}); + + TestNested(list_view(large_list_view(int32())), {"+vl", "+vL", "i"}, + {"", "item", "item"}); +} + TEST_F(TestSchemaExport, Struct) { auto type = struct_({field("a", int8()), field("b", utf8())}); TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, @@ -945,6 +954,33 @@ TEST_F(TestArrayExport, ListSliced) { } } +TEST_F(TestArrayExport, ListView) { + TestNested(list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(large_list_view(uint16()), "[[1, 2], [3, null], null]"); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestArrayExport, ListViewSliced) { + { + auto factory = []() { + return ArrayFromJSON(list_view(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = []() { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 
7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[5, 2, 0, 3]")->Slice(1, 2); + auto sizes = ArrayFromJSON(int32(), "[2, 3, 6, 1]")->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestArrayExport, Struct) { const char* data = R"([[1, "foo"], [2, null]])"; auto type = struct_({field("a", int8()), field("b", utf8())}); @@ -1490,6 +1526,45 @@ TEST_F(TestDeviceArrayExport, ListSliced) { } } +TEST_F(TestDeviceArrayExport, ListView) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list_view(uint16()), "[[1, 2], [3, null], null]"); + + TestNested(mm, list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListViewSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(list_view(int8()), + "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[5, 2, 0, 3]")->data()))->Slice(1, 2); + auto sizes = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[2, 3, 6, 1]")->data()))->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestDeviceArrayExport, Struct) { std::shared_ptr device = std::make_shared(1); auto mm = device->default_memory_manager(); @@ -1930,6 +2005,33 @@ TEST_F(TestSchemaImport, NestedList) { CheckImport(list(fixed_size_list(int8(), 3))); } +TEST_F(TestSchemaImport, ListView) { + FillPrimitive(AddChild(), "c"); + FillListLike("+vl"); + CheckImport(list_view(int8())); + 
+ FillPrimitive(AddChild(), "s", "item", 0); + FillListLike("+vl"); + CheckImport(list_view(field("item", int16(), /*nullable=*/false))); + + // Large list-view + FillPrimitive(AddChild(), "s"); + FillListLike("+vL"); + CheckImport(large_list_view(int16())); +} + +TEST_F(TestSchemaImport, NestedListView) { + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+vl"); + FillListLike("+vL"); + CheckImport(large_list_view(list_view(int8()))); + + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+w:3"); + FillListLike("+vl"); + CheckImport(list_view(fixed_size_list(int8(), 3))); +} + TEST_F(TestSchemaImport, Struct) { FillPrimitive(AddChild(), "u", "strs"); FillPrimitive(AddChild(), "S", "ints"); @@ -2325,6 +2427,18 @@ static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* large_list_buffers_no_nulls1[2] = {nullptr, large_list_offsets_buffer1}; +static const int32_t list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int32_t list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* list_view_buffers_no_nulls1[3] = {nullptr, list_view_offsets_buffer1, + list_view_sizes_buffer1}; +static const void* list_view_buffers_nulls1[3] = {bits_buffer1, list_view_offsets_buffer1, + list_view_sizes_buffer1}; + +static const int64_t large_list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int64_t large_list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* large_list_view_buffers_no_nulls1[3] = { + nullptr, large_list_view_offsets_buffer1, large_list_view_sizes_buffer1}; + static const int8_t type_codes_buffer1[] = {42, 42, 43, 43, 42}; static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; static const void* sparse_union_buffers1_legacy[2] = {nullptr, type_codes_buffer1}; @@ -2407,6 +2521,17 @@ class TestArrayImport : public ::testing::Test { c->children = NLastChildren(1, c); } + void FillListView(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const 
void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + void FillFixedSizeListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2463,6 +2588,11 @@ class TestArrayImport : public ::testing::Test { FillListLike(&c_struct_, length, null_count, offset, buffers); } + void FillListView(int64_t length, int64_t null_count, int64_t offset, + const void** buffers) { + FillListView(&c_struct_, length, null_count, offset, buffers); + } + void FillFixedSizeListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillFixedSizeListLike(&c_struct_, length, null_count, offset, buffers); @@ -2820,6 +2950,53 @@ TEST_F(TestArrayImport, ListWithOffset) { "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); } +TEST_F(TestArrayImport, ListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 1, 0, list_view_buffers_nulls1); + CheckImport( + ArrayFromJSON(list_view(int16()), "[[513, 1027], null, [1541, 2055, 2569]]")); + + // Large list-view + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list_view(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); +} + +TEST_F(TestArrayImport, NestedListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(AddChild(), 5, 0, 0, list_view_buffers_no_nulls1); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list_view(list_view(int8())), + "[[[1, 2], []], [], [[3, 4, 5], 
[6], [7, 8]]]")); + + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1_8); + FillFixedSizeListLike(AddChild(), 2, 0, 0, buffers_no_nulls_no_data); + FillListView(2, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(fixed_size_list(int8(), 3)), + "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestArrayImport, ListViewWithOffset) { + // Offset in child + FillPrimitive(AddChild(), 8, 0, 1, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + // Offset in parent + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + // Both + FillPrimitive(AddChild(), 8, 0, 2, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); +} + TEST_F(TestArrayImport, Struct) { FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_16); @@ -3117,6 +3294,17 @@ TEST_F(TestArrayImport, ListError) { CheckImportError(list(int8())); } +TEST_F(TestArrayImport, ListViewNoError) { + // Unlike with lists, importing a length-0 list-view with all buffers omitted is + // not an error. List-views don't need an extra offset value, so an empty offsets + // buffer is valid in this case. 
+ + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(list_view(int8()), "[]")); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -3370,6 +3558,12 @@ TEST_F(TestSchemaRoundtrip, List) { TestWithTypeFactory([]() { return list(fixed_size_list(utf8(), 5)); }); } +TEST_F(TestSchemaRoundtrip, ListView) { + TestWithTypeFactory([]() { return list_view(utf8()); }); + TestWithTypeFactory([]() { return large_list_view(list_view(utf8())); }); + TestWithTypeFactory([]() { return list_view(fixed_size_list(utf8(), 5)); }); +} + TEST_F(TestSchemaRoundtrip, Struct) { auto f1 = field("f1", utf8(), /*nullable=*/false); auto f2 = field("f2", list(decimal(19, 4))); @@ -3631,6 +3825,31 @@ TEST_F(TestArrayRoundtrip, List) { TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } +TEST_F(TestArrayRoundtrip, ListView) { + TestWithJSON(list_view(int32()), "[]"); + TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list_view(int32()), "[[4, 5], [6, null], null]"); + + // Out-of-order offsets + TestWithArrayFactory([this]() -> Result> { + std::shared_ptr offsets; + ArrayFromVector(int32(), + std::vector{false, true, true, true, false, true}, + std::vector{4, 2, 1, 3, 3, 2}, &offsets); + + std::shared_ptr sizes; + ArrayFromVector(std::vector{2, 2, 3, 1, 2, 0}, &sizes); + + auto values = ArrayFromJSON(int8(), "[4, 5, 6, null, 8, null]"); + auto result = ListViewArray::FromArrays(*offsets, *sizes, *values, pool_); + if (result.ok()) { + RETURN_NOT_OK((*result)->ValidateFull()); + } + return result; + }); +} + TEST_F(TestArrayRoundtrip, Struct) { auto type = struct_({field("ints", int16()), field("bools", boolean())}); TestWithJSON(type, "[]"); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc 
index 50cfdd05a14bb..bb632e2eb912d 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -308,6 +308,10 @@ class RangeDataEqualsImpl { Status Visit(const LargeListType& type) { return CompareList(type); } + Status Visit(const ListViewType& type) { return CompareListView(type); } + + Status Visit(const LargeListViewType& type) { return CompareListView(type); } + Status Visit(const FixedSizeListType& type) { const auto list_size = type.list_size(); const ArrayData& left_data = *left_.child_data[0]; @@ -493,6 +497,38 @@ class RangeDataEqualsImpl { return Status::OK(); } + template + Status CompareListView(const TypeClass& type) { + const ArrayData& left_values = *left_.child_data[0]; + const ArrayData& right_values = *right_.child_data[0]; + + using offset_type = typename TypeClass::offset_type; + const auto* left_offsets = left_.GetValues(1) + left_start_idx_; + const auto* right_offsets = right_.GetValues(1) + right_start_idx_; + const auto* left_sizes = left_.GetValues(2) + left_start_idx_; + const auto* right_sizes = right_.GetValues(2) + right_start_idx_; + + auto compare_view = [&](int64_t i, int64_t length) -> bool { + for (int64_t j = i; j < i + length; ++j) { + if (left_sizes[j] != right_sizes[j]) { + return false; + } + const offset_type size = left_sizes[j]; + if (size == 0) { + continue; + } + RangeDataEqualsImpl impl(options_, floating_approximate_, left_values, + right_values, left_offsets[j], right_offsets[j], size); + if (!impl.Compare()) { + return false; + } + } + return true; + }; + VisitValidRuns(std::move(compare_view)); + return Status::OK(); + } + template Status CompareRunEndEncoded() { auto left_span = ArraySpan(left_); @@ -699,7 +735,8 @@ class TypeEqualsVisitor { } template - enable_if_t::value, Status> Visit(const T& left) { + enable_if_t::value || is_list_view_type::value, Status> Visit( + const T& left) { std::shared_ptr left_field = left.field(0); std::shared_ptr right_field = checked_cast(right_).field(0); bool 
equal_names = !check_metadata_ || (left_field->name() == right_field->name()); @@ -857,6 +894,18 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const ListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + + Status Visit(const LargeListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 6b4b2339e4afe..ee181c053c053 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -82,9 +82,9 @@ std::optional GetConstantValidityWord(const ExecValue& data) { return {}; } -// if the condition is null then output is null otherwise we take validity from the -// selected argument -// ie. cond.valid & (cond.data & left.valid | ~cond.data & right.valid) +/// If the condition is null then output is null otherwise we take validity from the +/// selected argument +/// (i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)). struct IfElseNullPromoter { KernelContext* ctx; const ArraySpan& cond; @@ -368,7 +368,7 @@ void RunIfElseLoopInverted(const ArraySpan& cond, const HandleBlock& handle_bloc } /// Runs if-else when cond is a scalar. Two special functions are required, -/// 1.CopyArrayData, 2. BroadcastScalar +/// 1. CopyArrayData, 2. 
BroadcastScalar template Status RunIfElseScalar(const BooleanScalar& cond, const ExecValue& left, const ExecValue& right, ExecResult* out, @@ -1028,7 +1028,7 @@ struct NestedIfElseExec { // AAA static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1041,7 +1041,7 @@ struct NestedIfElseExec { // ASA static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1054,7 +1054,7 @@ struct NestedIfElseExec { // AAS static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1067,7 +1067,7 @@ struct NestedIfElseExec { // ASS static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1078,8 +1078,9 @@ struct NestedIfElseExec { } template - static Status RunLoop(KernelContext* ctx, const ArraySpan& cond, ExecResult* out, - HandleLeft&& handle_left, HandleRight&& handle_right) { + static Status RunLoopOfNestedIfElseExec(KernelContext* ctx, const ArraySpan& cond, + ExecResult* out, HandleLeft&& handle_left, + HandleRight&& handle_right) { std::unique_ptr raw_builder; 
RETURN_NOT_OK(MakeBuilderExactIndex(ctx->memory_pool(), out->type()->GetSharedPtr(), &raw_builder)); @@ -1308,9 +1309,9 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : - {Type::LIST, Type::LARGE_LIST, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, + Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 34225ce9fe084..b72402bbccd4e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -67,11 +67,15 @@ struct GetBytesProcessedVisitor { } template - enable_if_var_size_list Visit(const ArrowType& type) { + enable_if_var_length_list_like Visit(const ArrowType& type) { using ArrayType = typename TypeTraits::ArrayType; using OffsetType = typename TypeTraits::OffsetType::c_type; - total_bytes += (arr->length() + 1) * sizeof(OffsetType); + const auto num_offsets = is_list_view(type) ? arr->length() : arr->length() + 1; + total_bytes += num_offsets * sizeof(OffsetType); + // NOTE: the sizes buffer is not counted when type is a list-view as that + // can make the throughput numbers look better just because the sizes + // increase the number of bytes in the input. 
auto child_array = internal::checked_cast(arr)->values(); return RecurseInto(child_array.get()); } @@ -126,7 +130,7 @@ static void IfElseBench(benchmark::State& state) { } template -static void IfElseBenchList(benchmark::State& state) { +static void IfElseBenchVarLengthListLike(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBench(state, list_type); @@ -172,7 +176,7 @@ static void IfElseBenchContiguous(benchmark::State& state) { } template -static void IfElseBenchListContiguous(benchmark::State& state) { +static void IfElseBenchVarLengthListLikeContiguous(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBenchContiguous(state, list_type); @@ -187,11 +191,11 @@ static void IfElseBench32(benchmark::State& state) { } static void IfElseBenchListUInt32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchListString32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchString32(benchmark::State& state) { @@ -211,11 +215,27 @@ static void IfElseBench32Contiguous(benchmark::State& state) { } static void IfElseBenchListUInt32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchListString32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewUInt32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewString32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewUInt32Contiguous(benchmark::State& state) { + return 
IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewString32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchString64Contiguous(benchmark::State& state) { @@ -494,6 +514,12 @@ BENCHMARK(IfElseBenchListString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListUInt32Contiguous)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListString32Contiguous)->Args({kNumItems, 0}); +// IfElse: ListViews +BENCHMARK(IfElseBenchListViewUInt32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewUInt32Contiguous)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32Contiguous)->Args({kNumItems, 0}); + // IfElse: Strings BENCHMARK(IfElseBenchString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchString64)->Args({kNumItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index a9c5a1fc3c96f..a11aab81742ed 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -737,12 +737,15 @@ TEST_F(TestIfElseKernel, Decimal) { } } +using ListAndListViewArrowTypes = + ::testing::Types; + template -class TestIfElseList : public ::testing::Test {}; +class TestIfElseVarLengthListLike : public ::testing::Test {}; -TYPED_TEST_SUITE(TestIfElseList, ListArrowTypes); +TYPED_TEST_SUITE(TestIfElseVarLengthListLike, ListAndListViewArrowTypes); -TYPED_TEST(TestIfElseList, ListOfInt) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfInt) { auto type = std::make_shared(int32()); CheckWithDifferentShapes(ArrayFromJSON(boolean(), "[true, true, false, false]"), ArrayFromJSON(type, "[[], null, [1, null], [2, 3]]"), @@ -755,7 +758,7 @@ TYPED_TEST(TestIfElseList, ListOfInt) { ArrayFromJSON(type, "[null, null, null, null]")); } -TYPED_TEST(TestIfElseList, ListOfString) { 
+TYPED_TEST(TestIfElseVarLengthListLike, ListOfString) { auto type = std::make_shared(utf8()); CheckWithDifferentShapes( ArrayFromJSON(boolean(), "[true, true, false, false]"), diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index d395261597696..5d892af9a394e 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -807,6 +807,14 @@ struct ScalarToProtoImpl { return Status::OK(); } + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + Status Visit(const StructScalar& s) { lit_->set_allocated_struct_(new Lit::Struct()); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index d3fb058137e6a..f4a2e6800eb49 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -313,6 +313,10 @@ struct DataTypeToProtoImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { auto types = SetWithThen(&substrait::Type::set_allocated_struct_)->mutable_types(); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index e023e6a3a44d3..9b56928c68843 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -793,8 +793,6 @@ void CheckPrimitive(const std::shared_ptr& type, } TEST(TestJsonSchemaWriter, FlatTypes) { - // TODO - // field("f14", date32()) std::vector> fields = { field("f0", int8()), field("f1", int16(), false), @@ -822,6 +820,8 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f21", 
run_end_encoded(int16(), utf8())), field("f22", run_end_encoded(int32(), utf8())), field("f23", run_end_encoded(int64(), utf8())), + field("f24", list_view(int32())), + field("f25", large_list_view(uint8())), }; auto schema = ::arrow::schema(fields); @@ -1147,10 +1147,12 @@ TEST_P(TestJsonRoundTrip, RoundTrip) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 59749c36a958e..64eb342d5bd47 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -236,7 +236,7 @@ class SchemaWriter { enable_if_t::value || is_primitive_ctype::value || is_base_binary_type::value || is_binary_view_like_type::value || is_var_length_list_type::value || is_struct_type::value || - is_run_end_encoded_type::value> + is_run_end_encoded_type::value || is_list_view_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -422,6 +422,16 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const ListViewType& type) { + WriteName("listview", type); + return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + WriteName("largelistview", type); + return Status::OK(); + } + Status Visit(const MapType& type) { WriteName("map", type); return Status::OK(); @@ -777,6 +787,15 @@ class ArrayWriter { return WriteChildren(array.type()->fields(), {array.values()}); } + template + enable_if_list_view Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); + WriteIntegerField("SIZE", array.raw_value_sizes(), array.length()); + return 
WriteChildren(array.type()->fields(), {array.values()}); + } + Status Visit(const FixedSizeListArray& array) { WriteValidityField(array); const auto& type = checked_cast(*array.type()); @@ -1132,6 +1151,16 @@ Result> GetType(const RjObject& json_type, return Status::Invalid("Large list must have exactly one child"); } return large_list(children[0]); + } else if (type_name == "listview") { + if (children.size() != 1) { + return Status::Invalid("List-view must have exactly one child"); + } + return list_view(children[0]); + } else if (type_name == "largelistview") { + if (children.size() != 1) { + return Status::Invalid("Large list-view must have exactly one child"); + } + return large_list_view(children[0]); } else if (type_name == "map") { return GetMap(json_type, children); } else if (type_name == "fixedsizelist") { @@ -1651,6 +1680,26 @@ class ArrayReader { return CreateList(type_); } + template + Status CreateListView(const std::shared_ptr& type) { + using offset_type = typename T::offset_type; + + RETURN_NOT_OK(InitializeData(3)); + + RETURN_NOT_OK(GetNullBitmap()); + ARROW_ASSIGN_OR_RAISE(const auto json_offsets, GetMemberArray(obj_, "OFFSET")); + RETURN_NOT_OK(GetIntArray(json_offsets, length_, &data_->buffers[1])); + ARROW_ASSIGN_OR_RAISE(const auto json_sizes, GetMemberArray(obj_, "SIZE")); + RETURN_NOT_OK(GetIntArray(json_sizes, length_, &data_->buffers[2])); + RETURN_NOT_OK(GetChildren(obj_, *type)); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + return CreateListView(type_); + } + Status Visit(const MapType& type) { auto list_type = std::make_shared(type.value_field()); RETURN_NOT_OK(CreateList(list_type)); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 0b6ae4f620647..80e441fe2b670 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -329,9 +329,11 @@ namespace { const std::vector kBatchCases = { &ipc::test::MakeIntRecordBatch, 
&ipc::test::MakeListRecordBatch, + &ipc::test::MakeListViewRecordBatch, &ipc::test::MakeFixedSizeListRecordBatch, &ipc::test::MakeNonNullRecordBatch, &ipc::test::MakeDeeplyNestedList, + &ipc::test::MakeDeeplyNestedListView, &ipc::test::MakeStringTypesRecordBatchWithNulls, &ipc::test::MakeStruct, &ipc::test::MakeUnion, diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 50be10991ff9f..682c352132a11 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -74,6 +74,8 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeListViewRecordBatch(&batch)); + batches.push_back(batch); RETURN_NOT_OK(test::MakeDictionary(&batch)); batches.push_back(batch); RETURN_NOT_OK(test::MakeTimestamps(&batch)); diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 4d2d803f3f65e..ceeabe01677ed 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -123,12 +123,16 @@ Status GetConverter(const std::shared_ptr&, std::shared_ptr template class ConcreteConverter : public Converter { public: - Status AppendValues(const rj::Value& json_array) override { - auto self = static_cast(this); - if (!json_array.IsArray()) { - return JSONTypeError("array", json_array.GetType()); + Result SizeOfJSONArray(const rj::Value& json_obj) { + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); } - auto size = json_array.Size(); + return json_obj.Size(); + } + + Status AppendValues(const rj::Value& json_array) final { + auto self = static_cast(this); + ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(self->AppendValue(json_array[i])); } @@ -536,15 +540,19 @@ class FixedSizeBinaryConverter final // Converter for list arrays template -class ListConverter final : 
public ConcreteConverter> { +class VarLengthListLikeConverter final + : public ConcreteConverter> { public: using BuilderType = typename TypeTraits::BuilderType; - explicit ListConverter(const std::shared_ptr& type) { this->type_ = type; } + explicit VarLengthListLikeConverter(const std::shared_ptr& type) { + this->type_ = type; + } Status Init() override { - const auto& list_type = checked_cast(*this->type_); - RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + const auto& var_length_list_like_type = checked_cast(*this->type_); + RETURN_NOT_OK( + GetConverter(var_length_list_like_type.value_type(), &child_converter_)); auto child_builder = child_converter_->builder(); builder_ = std::make_shared(default_memory_pool(), child_builder, this->type_); @@ -555,8 +563,9 @@ class ListConverter final : public ConcreteConverter> { if (json_obj.IsNull()) { return this->AppendNull(); } - RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array + ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); + RETURN_NOT_OK(builder_->Append(true, size)); return child_converter_->AppendValues(json_obj); } @@ -898,8 +907,11 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, + VarLengthListLikeConverter) SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) diff --git 
a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b67c26999945b..ea3a9ae1a14a9 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -59,6 +59,9 @@ using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; +using ListAndListViewTypes = + ::testing::Types; + // Avoid undefined behaviour on signed overflow template Signed SafeSignedAdd(Signed u, Signed v) { @@ -591,145 +594,207 @@ TEST(TestDecimal, Dictionary) { } } -TEST(TestList, IntegerList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(int64()); - std::shared_ptr offsets, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - ArrayFromVector({}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 2, 2, 3}, &offsets); - ArrayFromVector({4, 5, 6}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - 
ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); +template +class TestVarLengthListArray : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + + static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); + + void TestIntegerList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 0, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + 
ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestList, IntegerListErrors) { - std::shared_ptr type = list(int64()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); -} - -TEST(TestList, NullList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(null()); - std::shared_ptr offsets, values, expected, actual; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - values = std::make_shared(0); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + void TestIntegerListErrors() { + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr array; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - values = std::make_shared(3); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, 
"[[0.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); + } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); + void TestNullList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(null()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + 
ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} -TEST(TestList, IntegerListList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(list(uint8())); - std::shared_ptr offsets, values, nested, expected, actual; + void TestIntegerListList() { + auto pool = default_memory_pool(); + std::shared_ptr type = + std::make_shared(std::make_shared(uint8())); + std::shared_ptr offsets, sizes, values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({1, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 2, 3}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 0, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 3, 1}, &sizes); 
+ ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.Append(true, 0)); + ASSERT_OK(list_builder.Finish(&expected)); + } + } +}; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 2, 3}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 2); - AssertArraysEqual(*expected, *actual); +TYPED_TEST_SUITE(TestVarLengthListArray, ListAndListViewTypes); - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 4); - AssertArraysEqual(*expected, *actual); +TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } - 
ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto& child_builder = checked_cast(*list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.Append()); - ASSERT_OK(list_builder.Finish(&expected)); - } -} +TYPED_TEST(TestVarLengthListArray, IntegerListErrors) { this->TestIntegerListErrors(); } -TEST(TestLargeList, Basics) { - // Similar as TestList above, only testing the basics - auto pool = default_memory_pool(); - std::shared_ptr type = large_list(int16()); - std::shared_ptr offsets, values, expected, actual; +TYPED_TEST(TestVarLengthListArray, NullList) { this->TestNullList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, LargeListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); -} +TYPED_TEST(TestVarLengthListArray, IntegerListList) { this->TestIntegerListList(); } TEST(TestMap, IntegerToInteger) { auto type = map(int16(), int16()); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index ab1a58dd1df8b..4f41edf8e15db 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -361,6 +361,18 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type::ListView: + if (children.size() != 1) { + return Status::Invalid("ListView must have exactly 1 child field"); 
+ } + *out = std::make_shared(children[0]); + return Status::OK(); + case flatbuf::Type::LargeListView: + if (children.size() != 1) { + return Status::Invalid("LargeListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type::Map: if (children.size() != 1) { return Status::Invalid("Map must have exactly 1 child field"); @@ -669,6 +681,20 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const ListViewType& type) { + fb_type_ = flatbuf::Type::ListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + fb_type_ = flatbuf::Type::LargeListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateLargeListView(fbb_).Union(); + return Status::OK(); + } + Status Visit(const MapType& type) { fb_type_ = flatbuf::Type::Map; RETURN_NOT_OK(VisitChildFields(type)); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 05a48aec2c7f3..98127c78388ca 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -376,10 +376,12 @@ TEST_F(TestSchemaMetadata, MetadataVersionForwardCompatibility) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, @@ -974,6 +976,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeListRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeListViewRecordBatch(&batch)); + TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeZeroLengthRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); @@ -982,6 +987,9 @@ TEST_F(TestWriteRecordBatch,
IntegerGetRecordBatchSize) { ASSERT_OK(MakeDeeplyNestedList(&batch)); TestGetRecordBatchSize(options_, batch); + + ASSERT_OK(MakeDeeplyNestedListView(&batch)); + TestGetRecordBatchSize(options_, batch); } class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index d603062d81d4a..d8d2d4b41a25a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -330,6 +330,22 @@ class ArrayLoader { return LoadChildren(type.fields()); } + template + Status LoadListView(const TYPE& type) { + out_->buffers.resize(3); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[2])); + + const int num_children = type.num_fields(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.fields()); + } + Status LoadChildren(const std::vector>& child_fields) { DCHECK_NE(out_, nullptr); ArrayData* parent = out_; @@ -392,6 +408,11 @@ class ArrayLoader { return LoadList(type); } + template + enable_if_list_view Visit(const T& type) { + return LoadListView(type); + } + Status Visit(const MapType& type) { RETURN_NOT_OK(LoadList(type)); return MapArray::ValidateChildData(out_->child_data); diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 6faaf96b332d4..87c02e2d87a1e 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -189,6 +189,32 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li return MakeListArray(child_array, num_lists, include_nulls, pool, out); } +Status MakeRandomListViewArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + 
const double null_probability = include_nulls ? 0.5 : 0.0; + *out = rand.ListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, /*coverage=*/0.9, + kDefaultBufferAlignment, pool); + return Status::OK(); +} + +Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, + int num_lists, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 0.5 : 0.0; + *out = rand.LargeListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, + /*coverage=*/0.9, kDefaultBufferAlignment, pool); + return Status::OK(); +} + Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { @@ -418,6 +444,31 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeListViewRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = field("f0", list_view(int32())); + auto f1 = field("f1", list_view(list_view(int32()))); + auto f2 = field("f2", large_list_view(int32())); + auto schema = ::arrow::schema({f0, f1, f2}); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListViewArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListViewArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomLargeListViewArray(leaf_values, length, include_nulls, pool, + &large_list_array)); + *out = + RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array}); + return Status::OK();
+} + Status MakeFixedSizeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", fixed_size_list(int32(), 1)); @@ -505,6 +556,27 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { return Status::OK(); } +Status MakeDeeplyNestedListView(std::shared_ptr* out) { + const int batch_length = 5; + auto type = int32(); + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(list_view(type)); + RETURN_NOT_OK( + MakeRandomListViewArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = field("f0", type); + auto schema = ::arrow::schema({f0}); + std::vector> arrays = {array}; + *out = RecordBatch::Make(schema, batch_length, arrays); + return Status::OK(); +} + Status MakeStruct(std::shared_ptr* out) { // reuse constructed list columns std::shared_ptr list_batch; diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index fc0c8ddbea319..db8613cbb1e6a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -107,6 +107,9 @@ Status MakeNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeListRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatch(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); @@ -119,6 +122,9 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeDeeplyNestedList(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedListView(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeStruct(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 9668f459d0d31..93256440f4a7a 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -350,6 +350,67 @@ class 
RecordBatchSerializer { return Status::OK(); } + template + Status GetZeroBasedListViewOffsets(const ArrayType& array, + std::shared_ptr* out_value_offsets, + offset_type* out_min_offset, + offset_type* out_max_end) { + auto offsets = array.value_offsets(); + auto sizes = array.value_sizes(); + + const int64_t required_bytes = sizeof(offset_type) * array.length(); + if (array.offset() != 0) { + // If we have a non-zero offset, it's likely that the smallest offset is + // not zero. We must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly. + + ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, + AllocateBuffer(required_bytes, options_.memory_pool)); + offset_type min_offset = 0; + offset_type max_end = 0; + if (array.length() > 0) { + min_offset = std::numeric_limits::max(); + for (int i = 0; i < array.length(); ++i) { + min_offset = std::min(min_offset, array.value_offset(i)); + max_end = std::max(max_end, array.value_offset(i) + array.value_length(i)); + } + } + + auto* dest_offsets = shifted_offsets->mutable_data_as(); + + for (int i = 0; i < array.length(); ++i) { + dest_offsets[i] = array.value_offset(i) - min_offset; + } + *out_min_offset = min_offset; + *out_max_end = max_end; + offsets = std::move(shifted_offsets); + } else { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated + // slice + if (offsets != nullptr && offsets->size() > required_bytes) { + offsets = SliceBuffer(offsets, 0, required_bytes); + } + *out_min_offset = 0; + *out_max_end = static_cast(array.values()->length()); + } + *out_value_offsets = std::move(offsets); + return Status::OK(); + } + + template + Status GetListViewSizes(const ArrayType& array, + std::shared_ptr* out_value_sizes) { + const int64_t required_bytes = sizeof(offset_type) * array.length(); + auto sizes = array.value_sizes(); + if (sizes != nullptr && (array.offset() != 0 || sizes->size() > required_bytes)) { + // ARROW-6046: Slice offsets to used extent, in 
case we have a truncated slice + auto offset_bytes = array.offset() * sizeof(offset_type); + sizes = SliceBuffer(sizes, offset_bytes, required_bytes); + } + *out_value_sizes = std::move(sizes); + return Status::OK(); + } + Status Visit(const BooleanArray& array) { std::shared_ptr data; RETURN_NOT_OK(GetTruncatedBitmap(array.offset(), array.length(), array.values(), @@ -428,7 +489,6 @@ class RecordBatchSerializer { RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); out_->body_buffers.emplace_back(value_offsets); - --max_recursion_depth_; std::shared_ptr values = array.values(); offset_type values_offset = 0; @@ -442,6 +502,37 @@ class RecordBatchSerializer { // Must also slice the values values = values->Slice(values_offset, values_length); } + --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return Status::OK(); + } + + template + enable_if_list_view Visit(const T& array) { + using offset_type = typename T::offset_type; + + offset_type min_offset = 0; + offset_type max_end = 0; + { + std::shared_ptr value_offsets; + RETURN_NOT_OK( + GetZeroBasedListViewOffsets(array, &value_offsets, &min_offset, &max_end)); + out_->body_buffers.push_back(std::move(value_offsets)); + } + { + std::shared_ptr value_sizes; + RETURN_NOT_OK(GetListViewSizes(array, &value_sizes)); + out_->body_buffers.push_back(std::move(value_sizes)); + } + + std::shared_ptr values = array.values(); + + if (min_offset != 0 || max_end < values->length()) { + // Must also slice the values: Slice takes (offset, length), not (begin, end) + values = values->Slice(min_offset, max_end - min_offset); + } + --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return Status::OK(); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index f7ab6fd10275f..2f819779bdb59 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -135,6 +135,10 @@ struct GenerateImpl { return OK(writer.EndArray(size)); } + Status Visit(const ListViewType&
t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); } Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index b392e027a6b89..e666ec70f9489 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -249,7 +249,8 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_list_like WriteDataValues(const ArrayType& array) { + enable_if_t::value || is_list_view_type::value, Status> + WriteDataValues(const ArrayType& array) { const auto values = array.values(); const auto child_options = ChildOptions(); ArrayPrinter values_printer(child_options, sink_); @@ -300,6 +301,8 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, Status> diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 9217e190d5b62..0db6ae4867299 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -774,8 +774,11 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) { CheckPrimitive(options, is_valid, values, expected, false); } -TEST_F(TestPrettyPrint, ListType) { - auto list_type = list(int64()); +template +void TestPrettyPrintVarLengthListLike() { + using LargeTypeClass = typename TypeTraits::LargeType; + auto var_list_type = std::make_shared(int64()); + auto var_large_list_type = std::make_shared(int64()); static const char* ex = R"expected([ [ @@ -836,7 +839,7 @@ TEST_F(TestPrettyPrint, ListType) { ] ])expected"; - auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + auto array = ArrayFromJSON(var_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); auto 
make_options = [](int indent, int window, int container_window) { auto options = PrettyPrintOptions(indent, window); options.container_window = container_window; @@ -850,8 +853,7 @@ TEST_F(TestPrettyPrint, ListType) { ex_3); CheckArray(*array, {0, 10}, ex_4); - list_type = large_list(int64()); - array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + array = ArrayFromJSON(var_large_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); CheckStream(*array, make_options(/*indent=*/0, /*window=*/10, /*container_window=*/5), ex); CheckStream(*array, make_options(/*indent=*/2, /*window=*/10, /*container_window=*/5), @@ -861,6 +863,93 @@ TEST_F(TestPrettyPrint, ListType) { CheckArray(*array, {0, 10}, ex_4); } +TEST_F(TestPrettyPrint, ListType) { TestPrettyPrintVarLengthListLike(); } + +template +void TestListViewSpecificPrettyPrinting() { + using ArrayType = typename TypeTraits::ArrayType; + using OffsetType = typename TypeTraits::OffsetType; + + auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + auto int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + auto int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + + auto Offsets = [](std::string_view json) { + return ArrayFromJSON(TypeTraits::type_singleton(), json); + }; + auto Sizes = Offsets; + + ASSERT_OK_AND_ASSIGN(auto int_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *int32_values)); + ASSERT_OK(int_list_view_array->ValidateFull()); + static const char* ex1 = + "[\n" + " [\n" + " 1,\n" + " 20\n" + " ],\n" + " [\n" + " 1\n" + " ],\n" + " [\n" + " 20\n" + " ],\n" + " [\n" + " 3\n" + " ]\n" + "]"; + CheckStream(*int_list_view_array, {}, ex1); + + ASSERT_OK_AND_ASSIGN(auto string_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + ASSERT_OK(string_list_view_array->ValidateFull()); + static const char* ex2 = + "[\n" + " [\n" + " \"Hello\",\n" + " \"World\"\n" + " 
],\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ],\n" + " [\n" + " null\n" + " ]\n" + "]"; + CheckStream(*string_list_view_array, {}, ex2); + + auto sliced_array = string_list_view_array->Slice(1, 2); + static const char* ex3 = + "[\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ]\n" + "]"; + CheckStream(*sliced_array, {}, ex3); + + ASSERT_OK_AND_ASSIGN( + auto empty_array, + ArrayType::FromArrays(*Offsets("[]"), *Sizes("[]"), *int16_values)); + ASSERT_OK(empty_array->ValidateFull()); + static const char* ex4 = "[]"; + CheckStream(*empty_array, {}, ex4); +} + +TEST_F(TestPrettyPrint, ListViewType) { + TestPrettyPrintVarLengthListLike(); + + TestListViewSpecificPrettyPrinting(); + TestListViewSpecificPrettyPrinting(); +} + TEST_F(TestPrettyPrint, ListTypeNoNewlines) { auto list_type = list(int64()); auto empty_array = ArrayFromJSON(list_type, "[]"); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 167e272705268..6996b46c8b61a 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -587,6 +587,12 @@ ListScalar::ListScalar(std::shared_ptr value, bool is_valid) LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list(value->type()), is_valid) {} +ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, list_view(value->type()), is_valid) {} + +LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); @@ -776,14 +782,6 @@ struct MakeNullImpl { return Status::OK(); } - template ::ScalarType> - Status VisitListLike(const T& type, int64_t value_size = 0) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, - MakeArrayOfNull(type.value_type(), value_size)); - 
out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); - return Status::OK(); - } - Status Visit(const FixedSizeBinaryType& type) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, AllocateBuffer(type.byte_width())); @@ -794,11 +792,25 @@ struct MakeNullImpl { return Status::OK(); } + template ::ScalarType> + Status VisitListLike(const T& type, int64_t list_size = 0) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, + MakeArrayOfNull(type.value_type(), list_size)); + out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); + return Status::OK(); + } + Status Visit(const ListType& type) { return VisitListLike(type); } + Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const MapType& type) { return VisitListLike(type); } - Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const ListViewType& type) { return VisitListLike(type); } + + Status Visit(const LargeListViewType& type) { + return VisitListLike(type); + } Status Visit(const FixedSizeListType& type) { return VisitListLike(type, type.list_size()); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 5175b0128524c..65c5ee4df0a04 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -531,6 +531,20 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar { explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); }; +struct ARROW_EXPORT ListViewScalar : public BaseListScalar { + using TypeClass = ListViewType; + using BaseListScalar::BaseListScalar; + + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + +struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar { + using TypeClass = LargeListViewType; + using BaseListScalar::BaseListScalar; + + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + struct ARROW_EXPORT MapScalar : public BaseListScalar { using TypeClass = MapType; using BaseListScalar::BaseListScalar; 
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index a188aea1669a4..cba817f67b1a9 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -394,6 +394,10 @@ class TestRealScalar : public ::testing::Test { void TestLargeListOf() { TestListOf(large_list(type_)); } + void TestListViewOf() { TestListOf(list_view(type_)); } + + void TestLargeListViewOf() { TestListOf(large_list_view(type_)); } + protected: std::shared_ptr type_; std::shared_ptr scalar_val_, scalar_other_, scalar_nan_, scalar_other_nan_, @@ -414,6 +418,10 @@ TYPED_TEST(TestRealScalar, ListOf) { this->TestListOf(); } TYPED_TEST(TestRealScalar, LargeListOf) { this->TestLargeListOf(); } +TYPED_TEST(TestRealScalar, ListViewOf) { this->TestListViewOf(); } + +TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); } + template class TestDecimalScalar : public ::testing::Test { public: @@ -1083,7 +1091,7 @@ void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& } template -class TestListScalar : public ::testing::Test { +class TestListLikeScalar : public ::testing::Test { public: using ScalarType = typename TypeTraits::ScalarType; @@ -1177,17 +1185,18 @@ class TestListScalar : public ::testing::Test { std::shared_ptr value_; }; -using ListScalarTestTypes = ::testing::Types; +using ListScalarTestTypes = ::testing::Types; -TYPED_TEST_SUITE(TestListScalar, ListScalarTestTypes); +TYPED_TEST_SUITE(TestListLikeScalar, ListScalarTestTypes); -TYPED_TEST(TestListScalar, Basics) { this->TestBasics(); } +TYPED_TEST(TestListLikeScalar, Basics) { this->TestBasics(); } -TYPED_TEST(TestListScalar, ValidateErrors) { this->TestValidateErrors(); } +TYPED_TEST(TestListLikeScalar, ValidateErrors) { this->TestValidateErrors(); } -TYPED_TEST(TestListScalar, Hashing) { this->TestHashing(); } +TYPED_TEST(TestListLikeScalar, Hashing) { this->TestHashing(); } -TYPED_TEST(TestListScalar, Cast) { this->TestCast(); } +TYPED_TEST(TestListLikeScalar, Cast) { 
this->TestCast(); } TEST(TestFixedSizeListScalar, ValidateErrors) { const auto ty = fixed_size_list(int16(), 3); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1386075397e20..c317fe7aef44c 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -499,6 +499,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { + template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, typename OffsetArrayType::value_type first_offset, @@ -608,6 +609,205 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ", + ArrowType::type_name())); + } + return output; +} + +/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm [1]. 
+/// +/// [1] https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] data The array to shuffle +template +void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { + DCHECK_EQ(data->type->id(), ListViewType::type_id); + using offset_type = typename ListViewType::offset_type; + + auto* validity = data->GetMutableValues(0, 0); + auto* offsets = data->GetMutableValues(1); + auto* sizes = data->GetMutableValues(2); + + pcg32_fast rng(seed); + using UniformDist = std::uniform_int_distribution; + UniformDist dist; + // Walk down from the last slot, swapping slot i with a random slot j in [0, i] + for (int64_t i = data->length - 1; i > 0; --i) { + const auto j = dist(rng, UniformDist::param_type(0, i)); + if (ARROW_PREDICT_TRUE(i != j)) { + // Swap validity bits: the second bit must be read from slot j so that nulls + // move together with their offsets and sizes + if (validity) { + const bool valid_i = bit_util::GetBit(validity, data->offset + i); + const bool valid_j = bit_util::GetBit(validity, data->offset + j); + if (valid_i != valid_j) { + bit_util::SetBitTo(validity, data->offset + i, valid_j); + bit_util::SetBitTo(validity, data->offset + j, valid_i); + } + } + // Swap offsets and sizes + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); + } + } +} + +/// \brief Generate the list-view offsets based on a random buffer of sizes. +/// +/// The sizes buffer is an input of this function, but when force_empty_nulls is true, +/// some values on the sizes buffer can be set to 0. 
+/// +/// \param[in] seed The seed for the random number generator (not read by the +/// current implementation) +/// \param[in,out] mutable_sizes_array The array of sizes to use +/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0 +/// \param[in] zero_undefined_offsets Whether to zero the offsets of list-views that are +/// null or have 0 set as the size +/// \param[out] out_max_view_end The maximum value of the end of a list-view, i.e. the +/// largest offset + size over all entries +/// \param[in] alignment The alignment passed to AllocateBuffer for the offsets buffer +/// \param[in] memory_pool The memory pool passed to AllocateBuffer for the offsets buffer +template +std::shared_ptr ViewOffsetsFromLengthsArray( + SeedType seed, OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t* out_max_view_end, int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename OffsetArrayType::TypeClass; + + auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); + + BufferVector buffers{2}; + buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to + buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(), + alignment, memory_pool); + auto offsets = buffers[1]->mutable_data_as(); + + offset_type offset = 0; + offset_type max_view_end = 0; + for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { + if (mutable_sizes_array.IsNull(i)) { + if (force_empty_nulls) { + sizes[i] = 0; + } + // Null entries do not advance the running offset + offsets[i] = zero_undefined_offsets ? 0 : offset; + } else { + if (sizes[i] == 0) { + offsets[i] = zero_undefined_offsets ? 
0 : offset; + } else { + offsets[i] = offset; + DCHECK_LT(offset, std::numeric_limits::max() - sizes[i]); + offset += sizes[i]; + } + } + max_view_end = std::max(max_view_end, offsets[i] + sizes[i]); + } + *out_max_view_end = max_view_end; + + auto array_data = + ArrayData::Make(TypeTraits::type_singleton(), + mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0); + return std::make_shared(std::move(array_data)); +} + +template +Result> ArrayOfListView(RAG& self, const Field& field, + int64_t length, int64_t alignment, + MemoryPool* memory_pool, + double null_probability) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename ArrayType::offset_type; + using OffsetArrayType = typename CTypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto zero_undefined_offsets = + GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto lengths = internal::checked_pointer_cast( + self.RAG::template Numeric( + length, min_length, max_length, null_probability)); + + int64_t max_view_end = 0; + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, &max_view_end, + alignment, memory_pool); + + const auto values = self.RAG::ArrayOf( + *internal::checked_pointer_cast(field.type())->value_field(), + /*values_length=*/max_view_end, alignment, memory_pool); + + ARROW_ASSIGN_OR_RAISE(auto list_view_array, + ArrayType::FromArrays(field.type(), *offsets, *lengths, *values)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + +template +Result> RandomListView(RAG& self, const Array& values, + 
int64_t length, double null_probability, + bool force_empty_nulls, double coverage, + int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename TypeClass::offset_type; + using OffsetArrayType = typename TypeTraits::OffsetArrayType; + using OffsetArrowType = typename OffsetArrayType::TypeClass; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + DCHECK_LE(length, std::numeric_limits::max()); + + auto offsets_array = GenerateOffsets>( + self.seed(), length + 1, 0, static_cast(values.length()), null_probability, + force_empty_nulls, alignment, memory_pool); + auto* offsets = offsets_array->data()->template GetValues(1); + + // The buffers for the sizes array + BufferVector buffers{2}; + buffers[0] = NULLPTR; + buffers[1] = *AllocateBuffer(sizeof(offset_type) * length, alignment, memory_pool); + auto sizes = buffers[1]->mutable_data_as(); + + // Derive sizes from offsets taking coverage into account + pcg32_fast rng(self.seed()); + using NormalDist = std::normal_distribution; + NormalDist size_dist; + for (int64_t i = 0; i < length; ++i) { + const double mean_size = coverage * (offsets[i + 1] - offsets[i]); + const double sampled_size = + std::max(0.0, size_dist(rng, NormalDist::param_type{mean_size})); + // This creates a higher probability of offset[i] + size[i] being closer or equal to + // values.length(), but that skew is acceptable for the purposes of testing. + const auto size = std::min(static_cast(std::llround(sampled_size)), + static_cast(values.length() - offsets[i])); + sizes[i] = offsets_array->IsNull(i) && force_empty_nulls ? 
0 : size; + } + + auto sizes_array_data = ArrayData::Make(TypeTraits::type_singleton(), + length, std::move(buffers), /*null_count=*/0); + auto sizes_array = std::make_shared(std::move(sizes_array_data)); + + ARROW_ASSIGN_OR_RAISE( + auto list_view_array, + ArrayType::FromArrays(*offsets_array, *sizes_array, values, memory_pool)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + } // namespace std::shared_ptr RandomArrayGenerator::Offsets( @@ -637,6 +837,24 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } +std::shared_ptr RandomArrayGenerator::ListView(const Array& values, int64_t length, + double null_probability, + bool force_empty_nulls, + double coverage, int64_t alignment, + MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + +std::shared_ptr RandomArrayGenerator::LargeListView( + const Array& values, int64_t length, double null_probability, bool force_empty_nulls, + double coverage, int64_t alignment, MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& keys, const std::shared_ptr& items, int64_t size, double null_probability, @@ -713,27 +931,6 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes); } -namespace { - -// Helper for RandomArrayGenerator::ArrayOf: extract some C value from -// a given metadata key. 
-template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, - const std::string& key, - T default_value) { - if (!metadata) return default_value; - const auto index = metadata->FindKey(key); - if (index < 0) return default_value; - const auto& value = metadata->value(index); - T output{}; - if (!internal::ParseValue(value.data(), value.length(), &output)) { - ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); - } - return output; -} - -} // namespace - std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, int64_t size, double null_probability, @@ -811,6 +1008,12 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \ } +#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + return *ArrayOfListView(*this, field, length, alignment, memory_pool, \ + null_probability); \ + } + const double null_probability = field.nullable() ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) @@ -946,6 +1149,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(ListArray); + GENERATE_LIST_VIEW_CASE(ListViewArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -1069,6 +1273,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(LargeListArray); + GENERATE_LIST_VIEW_CASE(LargeListViewArray); default: break; @@ -1077,6 +1282,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t #undef GENERATE_INTEGRAL_CASE #undef GENERATE_FLOATING_CASE #undef GENERATE_LIST_CASE +#undef GENERATE_LIST_VIEW_CASE #undef VALIDATE_RANGE #undef VALIDATE_MIN_MAX diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index cbdac3baa0109..1d97a3ada724a 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -458,6 +458,43 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random ListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The length (number of list-views) of the generated array + /// \param[in] null_probability the probability of a list-view value being null + /// \param[in] force_empty_nulls if true, the sizes of null list-view entries + /// are set to 0 + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr ListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + + /// \brief Generate a 
random LargeListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The length (number of list-views) of the generated array + /// \param[in] null_probability the probability of a list-view value being null + /// \param[in] force_empty_nulls if true, the sizes of null list-view entries + /// are set to 0 + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr LargeListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 951b654e56f73..a92ecf4e9c45b 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam { } bool HasList(const DataType& type) { - if (is_var_length_list(type.id())) { + if (is_var_length_list_like(type.id())) { return true; } for (const auto& child : type.fields()) { @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) { const int64_t alignment = 1024; auto field = GetField(); if (HasList(*field->type())) { - GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment"; + GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment"; } auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment); AssertTypeEqual(field->type(), array->type()); @@ -177,6 +177,13 @@ auto values = ::testing::Values( key_value_metadata({{"force_empty_nulls", "true"}})), field("listint81024values", list(int8()), true, 
key_value_metadata({{"values", "1024"}})), + field("listviewint8", list_view(int8())), + field("listviewlistviewint8", list_view(list_view(int8()))), + field("listviewint8emptynulls", list_view(int8()), true, + key_value_metadata( + {{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})), + field("listviewint81024values", list_view(int8()), true, + key_value_metadata({{"values", "1024"}})), field("structints", struct_({ field("int8", int8()), field("int16", int16()), @@ -201,7 +208,8 @@ auto values = ::testing::Values( field("fixedsizelist", fixed_size_list(int8(), 4)), field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), field("largebinary", large_binary()), - field("largelistlistint8", large_list(list(int8())))); + field("largelistlistint8", large_list(list(int8()))), + field("largelistviewlistviewint8", large_list_view(list_view(int8())))); INSTANTIATE_TEST_SUITE_P( TestRandomArrayGeneration, RandomArrayTest, values, @@ -400,6 +408,39 @@ TEST(TypeSpecificTests, ListLengths) { } } +TEST(TypeSpecificTests, ListViewLengths) { + { + auto field = + arrow::field("list_view", list_view(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), kExpectedLength); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list_view", large_list_view(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_EQ(array->length(), kExpectedLength); + 
ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + TEST(TypeSpecificTests, MapValues) { auto field = arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); @@ -500,6 +541,24 @@ TEST(RandomList, Basics) { } } +TEST(RandomListView, Basics) { + random::RandomArrayGenerator rng(42); + for (const double null_probability : {0.0, 0.1, 0.98}) { + SCOPED_TRACE("null_probability = " + std::to_string(null_probability)); + auto values = rng.Int16(1234, 0, 10000, null_probability); + auto array = rng.ListView(*values, 45, null_probability); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 45); + const auto& list_view_array = checked_cast(*array); + ASSERT_EQ(list_view_array.values()->length(), 1234); + int64_t null_count = 0; + for (int64_t i = 0; i < array->length(); ++i) { + null_count += array->IsNull(i); + } + ASSERT_EQ(null_count, array->data()->null_count); + } +} + TEST(RandomChildFieldNullablity, List) { random::RandomArrayGenerator rng(42); @@ -513,6 +572,19 @@ TEST(RandomChildFieldNullablity, List) { ARROW_EXPECT_OK(batch->ValidateFull()); } +TEST(RandomChildFieldNullablity, ListView) { + random::RandomArrayGenerator rng(42); + + auto item = arrow::field("item", arrow::int8(), true); + auto nest_list_view_field = arrow::field("list_view", list_view(item), false); + auto list_view_field = arrow::field("list_view", list_view(nest_list_view_field), true); + auto array = rng.ArrayOf(*list_view_field, 428); + ARROW_EXPECT_OK(array->ValidateFull()); + + auto batch = rng.BatchOf({list_view_field}, 428); + ARROW_EXPECT_OK(batch->ValidateFull()); +} + TEST(RandomChildFieldNullablity, Struct) { random::RandomArrayGenerator rng(42); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f378bd974047d..62d2d61598dc8 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -140,6 +140,8 @@ std::vector 
AllTypeIds() { Type::STRUCT, Type::LIST, Type::LARGE_LIST, + Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::MAP, Type::DENSE_UNION, @@ -209,6 +211,8 @@ std::string ToString(Type::type id) { TO_STRING_CASE(STRUCT) TO_STRING_CASE(LIST) TO_STRING_CASE(LARGE_LIST) + TO_STRING_CASE(LIST_VIEW) + TO_STRING_CASE(LARGE_LIST_VIEW) TO_STRING_CASE(FIXED_SIZE_LIST) TO_STRING_CASE(MAP) TO_STRING_CASE(DENSE_UNION) @@ -992,6 +996,18 @@ std::string LargeListType::ToString() const { return s.str(); } +std::string ListViewType::ToString() const { + std::stringstream s; + s << "list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + +std::string LargeListViewType::ToString() const { + std::stringstream s; + s << "large_list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + MapType::MapType(std::shared_ptr key_type, std::shared_ptr item_type, bool keys_sorted) : MapType(::arrow::field("key", std::move(key_type), false), @@ -2888,6 +2904,38 @@ std::string LargeListType::ComputeFingerprint() const { return ""; } +std::string ListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + +std::string LargeListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + std::string MapType::ComputeFingerprint() const { const auto& key_fingerprint = key_type()->fingerprint(); const auto& item_fingerprint = item_type()->fingerprint(); @@ -3138,6 +3186,22 @@ 
std::shared_ptr fixed_size_list(const std::shared_ptr& value_fi return std::make_shared(value_field, list_size); } +std::shared_ptr list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + std::shared_ptr struct_(const FieldVector& fields) { return std::make_shared(fields); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a905192e4a54e..5b1331ab66919 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1174,6 +1174,71 @@ class ARROW_EXPORT LargeListType : public BaseListType { std::string ComputeFingerprint() const override; }; +/// \brief Type class for array of list views +class ARROW_EXPORT ListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LIST_VIEW; + using offset_type = int32_t; + + static constexpr const char* type_name() { return "list_view"; } + + // ListView can contain any other logical value type + explicit ListViewType(const std::shared_ptr& value_type) + : ListViewType(std::make_shared("item", value_type)) {} + + explicit ListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + +/// \brief Concrete type class for large list-view data +/// +/// LargeListViewType is like 
ListViewType but with 64-bit rather than 32-bit offsets and +/// sizes. +class ARROW_EXPORT LargeListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST_VIEW; + using offset_type = int64_t; + + static constexpr const char* type_name() { return "large_list_view"; } + + // LargeListView can contain any other logical value type + explicit LargeListViewType(const std::shared_ptr& value_type) + : LargeListViewType(std::make_shared("item", value_type)) {} + + explicit LargeListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "large_list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for map data /// /// Map data is nested data where each value is a variable number of diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ca263b710317b..63eec10bf723b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -150,6 +150,16 @@ class LargeListArray; class LargeListBuilder; struct LargeListScalar; +class ListViewType; +class ListViewArray; +class ListViewBuilder; +struct ListViewScalar; + +class LargeListViewType; +class LargeListViewArray; +class LargeListViewBuilder; +struct LargeListViewScalar; + class MapType; class MapArray; class MapBuilder; @@ -432,6 +442,12 @@ struct Type { /// Bytes view type with 4-byte prefix and inline small string optimization BINARY_VIEW = 40, + /// A list of some logical data type represented by offset and size. 
+ LIST_VIEW = 41, + + /// Like LIST_VIEW, but with 64-bit offsets and sizes + LARGE_LIST_VIEW = 42, + // Leave this at the end MAX_ID }; @@ -523,6 +539,19 @@ std::shared_ptr large_list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr large_list(const std::shared_ptr& value_type); +/// \brief Create a ListViewType instance +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a ListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance +ARROW_EXPORT std::shared_ptr large_list_view( + std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr large_list_view(std::shared_ptr value_type); + /// \brief Create a MapType instance from its key and value DataTypes ARROW_EXPORT std::shared_ptr map(std::shared_ptr key_type, diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 273f8933fa577..009e557f82f68 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1553,6 +1553,46 @@ TEST(TestLargeListType, Basics) { ASSERT_EQ("large_list>", lt2.ToString()); } +TEST(TestListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + ListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LIST_VIEW); + + ASSERT_EQ("list_view", list_view_type.name()); + ASSERT_EQ("list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("list_view", lt->ToString()); + + ListViewType lt2(lt); + ASSERT_EQ("list_view>", lt2.ToString()); +} + +TEST(TestLargeListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + LargeListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), 
Type::LARGE_LIST_VIEW); + + ASSERT_EQ("large_list_view", list_view_type.name()); + ASSERT_EQ("large_list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("large_list_view", lt->ToString()); + + LargeListViewType lt2(lt); + ASSERT_EQ("large_list_view>", lt2.ToString()); +} + TEST(TestMapType, Basics) { auto md = key_value_metadata({"foo"}, {"foo value"}); @@ -1829,6 +1869,32 @@ TEST(TestListType, Equals) { ASSERT_FALSE(list_type.Equals(list_type_named, /*check_metadata=*/true)); } +TEST(TestListViewType, Equals) { + auto t1 = list_view(utf8()); + auto t2 = list_view(utf8()); + auto t3 = list_view(binary()); + auto t4 = list_view(field("item", utf8(), /*nullable=*/false)); + auto tl1 = large_list_view(binary()); + auto tl2 = large_list_view(binary()); + auto tl3 = large_list_view(float64()); + + AssertTypeEqual(*t1, *t2); + AssertTypeNotEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t4); + AssertTypeNotEqual(*t3, *tl1); + AssertTypeEqual(*tl1, *tl2); + AssertTypeNotEqual(*tl2, *tl3); + + std::shared_ptr vt = std::make_shared(); + std::shared_ptr inner_field = std::make_shared("non_default_name", vt); + + ListViewType list_view_type(vt); + ListViewType list_view_type_named(inner_field); + + AssertTypeEqual(list_view_type, list_view_type_named); + ASSERT_FALSE(list_view_type.Equals(list_view_type_named, /*check_metadata=*/true)); +} + TEST(TestListType, Metadata) { auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); @@ -1859,6 +1925,66 @@ TEST(TestListType, Metadata) { AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); } +TEST(TestListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = 
key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = list_view(f1); + auto t2 = list_view(f2); + auto t3 = list_view(f3); + auto t4 = list_view(f4); + auto t5 = list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + +TEST(TestLargeListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = large_list_view(f1); + auto t2 = large_list_view(f2); + auto t3 = large_list_view(f3); + auto t4 = large_list_view(f4); + auto t5 = large_list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + TEST(TestNestedType, Equals) { auto 
create_struct = [](std::string inner_name, std::string struct_name) -> std::shared_ptr { @@ -2258,6 +2384,44 @@ TEST(TypesTest, TestRunEndEncodedType) { "run_end_encoded>"); } +TEST(TypesTest, TestListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "list_view"); +} + +TEST(TypesTest, TestLargeListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = large_list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = large_list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "large_list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "large_list_view"); +} + #define TEST_PREDICATE(all_types, type_predicate) \ for (auto type : all_types) { \ ASSERT_EQ(type_predicate(type->id()), type_predicate(*type)); \ @@ -2296,6 +2460,7 @@ TEST(TypesTest, TestMembership) { TEST_PREDICATE(all_types, is_fixed_width); TEST_PREDICATE(all_types, is_var_length_list); TEST_PREDICATE(all_types, is_list_like); + TEST_PREDICATE(all_types, is_var_length_list_like); TEST_PREDICATE(all_types, is_nested); TEST_PREDICATE(all_types, 
is_union); } diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index de328f322ad5f..ded54aff463c1 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -67,21 +67,23 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::BINARY: // Offsets may be cast to int32_t* case Type::DATE32: case Type::TIME32: - case Type::LIST: // Offsets may be cast to int32_t*, data is in child array - case Type::MAP: // This is a list array + case Type::LIST: // Offsets may be cast to int32_t* + case Type::LIST_VIEW: // Offsets and sizes may be cast to int32_t* + case Type::MAP: // Same as LIST case Type::INTERVAL_MONTHS: // Stored as int32_t* case Type::INTERVAL_DAY_TIME: // Stored as two contiguous 32-bit integers return 4; case Type::INT64: case Type::UINT64: case Type::DOUBLE: - case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::LARGE_BINARY: // Offsets may be cast to int64_t* - case Type::LARGE_LIST: // Offsets may be cast to int64_t* - case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::LARGE_BINARY: // Offsets may be cast to int64_t* + case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::LARGE_LIST: // Offsets may be cast to int64_t* + case Type::LARGE_LIST_VIEW: // Offsets and sizes may be cast to int64_t* case Type::DATE64: case Type::TIME64: case Type::TIMESTAMP: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 9d8cafacf397b..ed66c9367dc36 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -449,6 +449,7 @@ struct TypeTraits { using OffsetBuilderType = 
Int32Builder; using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; + using LargeType = LargeListType; }; template <> @@ -463,6 +464,31 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = ListViewArray; + using BuilderType = ListViewBuilder; + using ScalarType = ListViewScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = false; + using LargeType = LargeListViewType; +}; + +template <> +struct TypeTraits { + using ArrayType = LargeListViewArray; + using BuilderType = LargeListViewBuilder; + using ScalarType = LargeListViewScalar; + using OffsetType = Int64Type; + using OffsetArrayType = Int64Array; + using OffsetBuilderType = Int64Builder; + using OffsetScalarType = Int64Scalar; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = MapArray; @@ -750,6 +776,13 @@ using is_list_type = template using enable_if_list_type = enable_if_t::value, R>; +template +using is_list_view_type = + std::disjunction, std::is_same>; + +template +using enable_if_list_view = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -758,6 +791,14 @@ using is_list_like_type = template using enable_if_list_like = enable_if_t::value, R>; +template +using is_var_length_list_like_type = + std::disjunction, is_list_view_type>; + +template +using enable_if_var_length_list_like = + enable_if_t::value, R>; + template using is_struct_type = std::is_base_of; @@ -1303,6 +1344,39 @@ constexpr bool is_list_like(Type::type type_id) { return false; } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a var-length list or list-view like type +constexpr bool 
is_var_length_list_like(Type::type type_id) { + switch (type_id) { + case Type::LIST: + case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + case Type::MAP: + return true; + default: + break; + } + return false; +} + +/// \brief Check for a list-view type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a list-view type one +constexpr bool is_list_view(Type::type type_id) { + switch (type_id) { + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a nested type /// /// \param[in] type_id the type-id to check @@ -1311,6 +1385,8 @@ constexpr bool is_nested(Type::type type_id) { switch (type_id) { case Type::LIST: case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: case Type::FIXED_SIZE_LIST: case Type::MAP: case Type::STRUCT: @@ -1403,12 +1479,14 @@ static inline int offset_bit_width(Type::type type_id) { case Type::STRING: case Type::BINARY: case Type::LIST: + case Type::LIST_VIEW: case Type::MAP: case Type::DENSE_UNION: return 32; case Type::LARGE_STRING: case Type::LARGE_BINARY: case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: return 64; default: break; @@ -1609,6 +1687,24 @@ static inline bool is_var_length_list(const DataType& type) { /// Convenience for checking using the type's id static inline bool is_list_like(const DataType& type) { return is_list_like(type.id()); } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type the type to check +/// \return whether type is a var-length list or list-view like type +/// +/// Convenience for checking using the type's id +static inline bool is_var_length_list_like(const DataType& type) { + return is_var_length_list_like(type.id()); +} + +/// \brief Check for a list-view type +/// +/// \param[in] type the type to check +/// \return whether type is a list-view type +/// +/// Convenience for checking using the type's id 
+static inline bool is_list_view(const DataType& type) { return is_list_view(type.id()); } + /// \brief Check for a nested type /// /// \param[in] type the type to check diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 3cecab3a633cc..eb3e51d5e4466 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -53,6 +53,7 @@ add_arrow_test(utility-test int_util_test.cc ${IO_UTIL_TEST_SOURCES} iterator_test.cc + list_util_test.cc logging_test.cc queue_test.cc range_test.cc diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc new file mode 100644 index 0000000000000..15196ff8c12cf --- /dev/null +++ b/cpp/src/arrow/util/list_util.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" + +namespace arrow::list_util { + +namespace internal { + +namespace { + +using arrow::internal::checked_cast; +using arrow::internal::ReverseSetBitRunReader; +using arrow::internal::SetBitRunReader; + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +std::optional MinViewOffset(const ArraySpan& input) { + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + + // Make an access to the sizes buffer only when strictly necessary. +#define MINIMIZE_MIN_VIEW_OFFSET(i) \ + auto offset = offsets[i]; \ + if (min_offset.has_value()) { \ + if (offset < *min_offset && sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } else { \ + if (sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } + + std::optional min_offset; + if (validity == nullptr) { + for (int64_t i = 0; i < input.length; i++) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } else { + SetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position; i < run.position + run.length; ++i) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } + } + return min_offset; + +#undef MINIMIZE_MIN_VIEW_OFFSET +} + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +int64_t MaxViewEnd(const ArraySpan& input) { + const auto values_length 
= input.child_data[0].length; + + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + +#define MAXIMIZE_MAX_VIEW_END(i) \ + const auto offset = static_cast(offsets[i]); \ + const offset_type size = sizes[i]; \ + if (size > 0) { \ + const int64_t end = offset + size; \ + if (end > max_end) { \ + if (end == values_length) { \ + return values_length; \ + } \ + max_end = end; \ + } \ + } + + int64_t max_end = 0; + if (validity == nullptr) { + for (int64_t i = input.length - 1; i >= 0; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } else { + ReverseSetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position + run.length - 1; i >= run.position; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } + } + return max_end; + +#undef MAXIMIZE_MAX_VIEW_END +} + +template +std::pair RangeOfValuesUsedByListView(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + if (input.length == 0 || input.null_count == input.length) { + return {0, 0}; + } + const auto min_offset = MinViewOffset(input); + // If all list-views are empty, min_offset will be std::nullopt. 
+ if (!min_offset.has_value()) { + return {0, 0}; + } + const int64_t max_end = MaxViewEnd(input); + return {*min_offset, max_end - *min_offset}; +} + +template +std::pair RangeOfValuesUsedByList(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + if (input.length == 0) { + return {0, 0}; + } + const auto* offsets = input.buffers[1].data_as(); + const int64_t min_offset = offsets[input.offset]; + const int64_t max_end = offsets[input.offset + input.length]; + return {min_offset, max_end - min_offset}; +} + +template +int64_t SumOfListSizes(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, offsets](int64_t run_start, int64_t run_length) { + sum += offsets[run_start + run_length] - offsets[run_start]; + }); + return sum; +} + +template +int64_t SumOfListViewSizes(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* sizes = input.GetValues(2); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, sizes](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + sum += sizes[i]; + } + }); + return sum; +} + +} // namespace + +Result> RangeOfValuesUsed(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return RangeOfValuesUsedByList(input); + case Type::MAP: + return RangeOfValuesUsedByList(input); + case Type::LARGE_LIST: + return RangeOfValuesUsedByList(input); + case Type::LIST_VIEW: + return RangeOfValuesUsedByListView(input); + case Type::LARGE_LIST_VIEW: + return RangeOfValuesUsedByListView(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "RangeOfValuesUsed:
input is not a var-length list-like array"); +} + +Result SumOfLogicalListSizes(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return SumOfListSizes(input); + case Type::MAP: + return SumOfListSizes(input); + case Type::LARGE_LIST: + return SumOfListSizes(input); + case Type::LIST_VIEW: + return SumOfListViewSizes(input); + case Type::LARGE_LIST_VIEW: + return SumOfListViewSizes(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "SumOfLogicalListSizes: input is not a var-length list-like array"); +} + +} // namespace internal + +} // namespace arrow::list_util diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h new file mode 100644 index 0000000000000..467f4eb15edb7 --- /dev/null +++ b/cpp/src/arrow/util/list_util.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/result.h" + +namespace arrow { +namespace list_util { +namespace internal { + +/// \brief Calculate the smallest continuous range of values used by the +/// var-length list-like input (list, map and list-view types). 
+/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return A pair of (offset, length) describing the range +ARROW_EXPORT Result> RangeOfValuesUsed( + const ArraySpan& input); + +/// \brief Calculate the sum of the sizes of all valid lists or list-views +/// +/// This is usually the same as the length of the RangeOfValuesUsed() range, but +/// it can be: +/// - Smaller: when the child array contains many values that are not +/// referenced by the lists or list-views in the parent array +/// - Greater: when the list-views share child array ranges +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return The sum of all list or list-view sizes +ARROW_EXPORT Result SumOfLogicalListSizes(const ArraySpan& input); + +} // namespace internal + +} // namespace list_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc new file mode 100644 index 0000000000000..4021180b2bef3 --- /dev/null +++ b/cpp/src/arrow/util/list_util_test.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include + +#include "arrow/array/builder_nested.h" +#include "arrow/util/list_util.h" + +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +using ListAndListViewTypes = + ::testing::Types; + +template +class TestListUtils : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_.reset(checked_cast(tmp.release())); + } + + void TestRangeOfValuesUsed() { + std::shared_ptr result; + + // These list-views are built manually with the list-view builders instead + // of using something like ArrayFromJSON() because we want to test the + // RangeOfValuesUsed() function's ability to handle arrays containing + // overlapping list-views. 
+ + // Empty list-like array + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(auto range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // List-like array with only nulls + ASSERT_OK(builder_->AppendNulls(3)); + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // Array with nulls and non-nulls (starting at a non-zero offset) + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } + std::shared_ptr array; + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + ASSERT_EQ(range.second, 5); + + // Overlapping list-views + vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + 
ASSERT_OK(vb->Append(5)); + // -- used range ends here -- + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } else { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + ASSERT_OK(vb->Append(5)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + } + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_ARRAYS_EQUAL( + *array, *ArrayFromJSON( + type_, "[null, [0, 1, 2, 3, 4, 5], [1, 2], null, [4], null, null]")); + // Check the range + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + if constexpr (is_list_view_type::value) { + ASSERT_EQ(range.second, 6); + } else { + ASSERT_EQ(range.second, 9); + } + // Check the sum of logical sizes as well + ASSERT_OK_AND_ASSIGN(int64_t sum_of_logical_sizes, + list_util::internal::SumOfLogicalListSizes(*array->data())); + ASSERT_EQ(sum_of_logical_sizes, 9); + } + + protected: + MemoryPool* pool_ = default_memory_pool(); + std::shared_ptr type_; + std::shared_ptr value_type_; + std::shared_ptr builder_; +}; + +TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); + +TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } + +} // namespace arrow diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index e057f6b12fb1b..cca99033c9350 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -63,6 +63,8 @@ ARRAY_VISITOR_DEFAULT(MonthIntervalArray) ARRAY_VISITOR_DEFAULT(DurationArray) ARRAY_VISITOR_DEFAULT(ListArray) ARRAY_VISITOR_DEFAULT(LargeListArray) +ARRAY_VISITOR_DEFAULT(ListViewArray) 
+ARRAY_VISITOR_DEFAULT(LargeListViewArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) @@ -117,6 +119,8 @@ TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(Decimal256Type) TYPE_VISITOR_DEFAULT(ListType) TYPE_VISITOR_DEFAULT(LargeListType) +TYPE_VISITOR_DEFAULT(ListViewType) +TYPE_VISITOR_DEFAULT(LargeListViewType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) @@ -170,6 +174,8 @@ SCALAR_VISITOR_DEFAULT(Decimal128Scalar) SCALAR_VISITOR_DEFAULT(Decimal256Scalar) SCALAR_VISITOR_DEFAULT(ListScalar) SCALAR_VISITOR_DEFAULT(LargeListScalar) +SCALAR_VISITOR_DEFAULT(ListViewScalar) +SCALAR_VISITOR_DEFAULT(LargeListViewScalar) SCALAR_VISITOR_DEFAULT(MapScalar) SCALAR_VISITOR_DEFAULT(FixedSizeListScalar) SCALAR_VISITOR_DEFAULT(StructScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 650b0e7ee0a30..75ef46ae4e5c3 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -64,6 +64,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Decimal256Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const LargeListArray& array); + virtual Status Visit(const ListViewArray& array); + virtual Status Visit(const LargeListViewArray& array); virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); @@ -115,6 +117,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const Decimal256Type& type); virtual Status Visit(const ListType& type); virtual Status Visit(const LargeListType& type); + virtual Status Visit(const ListViewType& type); + virtual Status Visit(const LargeListViewType& type); virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); @@ -166,6 +170,8 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const Decimal256Scalar& scalar); virtual Status Visit(const ListScalar& scalar); virtual Status Visit(const LargeListScalar& scalar); + virtual Status Visit(const ListViewScalar& scalar); + virtual Status Visit(const LargeListViewScalar& scalar); virtual Status Visit(const MapScalar& scalar); virtual Status Visit(const FixedSizeListScalar& scalar); virtual Status Visit(const StructScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 4b57abe53ff14..cbb081bfed311 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -59,6 +59,8 @@ namespace arrow { ACTION(Decimal256); \ ACTION(List); \ ACTION(LargeList); \ + ACTION(ListView); \ + ACTION(LargeListView); \ ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 919c97f4323b6..2d20403eac075 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -830,6 +830,8 @@ class PathBuilder { // Types not yet supported in Parquet. 
NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 5dff533c1cce2..8f4ffc67935ee 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -129,6 +129,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Union); NOT_IMPLEMENTED_VISIT(List); NOT_IMPLEMENTED_VISIT(LargeList); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); NOT_IMPLEMENTED_VISIT(Struct); NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e0884686acf6c..e2022171214b7 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -207,6 +207,10 @@ names and types of child fields are read from the child arrays. +------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ +| ``+lv`` | list-view | | ++------------------------+---------------------------------------------------+------------+ +| ``+Lv`` | large list-view | | ++------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ | ``+s`` | struct | | @@ -243,6 +247,8 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. +* A ``large_list_view`` array has format string ``+Lv``, and its single + child has format string ``L``. 
* A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and ``f`` respectively. diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 8ed5d4e216e8e..e979342b886da 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1350,6 +1350,8 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) ||