From 2e8f25de27c0ad574a306fe48059e5495dfed938 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 18 Sep 2023 12:10:43 -0400 Subject: [PATCH 1/5] GH-37710: [C++][Integration] Add C++ Utf8View implementation --- cpp/src/arrow/array/array_base.cc | 2 + cpp/src/arrow/array/array_binary.cc | 28 ++ cpp/src/arrow/array/array_binary.h | 60 +++++ cpp/src/arrow/array/array_binary_test.cc | 148 ++++++++-- cpp/src/arrow/array/array_test.cc | 8 +- cpp/src/arrow/array/array_view_test.cc | 32 +++ cpp/src/arrow/array/builder_base.cc | 17 +- cpp/src/arrow/array/builder_binary.cc | 64 ++++- cpp/src/arrow/array/builder_binary.h | 253 +++++++++++++++++- cpp/src/arrow/array/builder_dict.cc | 6 + cpp/src/arrow/array/builder_dict.h | 7 + cpp/src/arrow/array/concatenate.cc | 29 ++ cpp/src/arrow/array/concatenate_test.cc | 13 + cpp/src/arrow/array/data.cc | 63 ++++- cpp/src/arrow/array/data.h | 17 +- cpp/src/arrow/array/dict_internal.h | 26 ++ cpp/src/arrow/array/util.cc | 54 +++- cpp/src/arrow/array/util.h | 13 + cpp/src/arrow/array/validate.cc | 151 ++++++++--- cpp/src/arrow/buffer_builder.h | 7 +- cpp/src/arrow/builder.cc | 4 +- cpp/src/arrow/compare.cc | 28 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 95 ++----- .../engine/substrait/expression_internal.cc | 2 + .../arrow/engine/substrait/type_internal.cc | 2 + cpp/src/arrow/integration/json_internal.cc | 204 ++++++++++++-- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/feather_test.cc | 6 +- cpp/src/arrow/ipc/json_simple.cc | 4 + cpp/src/arrow/ipc/json_simple_test.cc | 10 +- cpp/src/arrow/ipc/metadata_internal.cc | 38 ++- cpp/src/arrow/ipc/metadata_internal.h | 6 +- cpp/src/arrow/ipc/read_write_test.cc | 91 ++++--- cpp/src/arrow/ipc/reader.cc | 36 ++- cpp/src/arrow/ipc/test_common.cc | 53 ++-- cpp/src/arrow/ipc/test_common.h | 2 +- cpp/src/arrow/ipc/writer.cc | 24 +- cpp/src/arrow/ipc/writer.h | 1 + cpp/src/arrow/json/converter.cc | 2 + cpp/src/arrow/json/converter_test.cc | 4 +- cpp/src/arrow/json/parser.cc | 1 + cpp/src/arrow/json/test_common.h | 10 +- cpp/src/arrow/pretty_print.cc | 1 + cpp/src/arrow/scalar.cc | 19 +- cpp/src/arrow/scalar.h | 43 ++- cpp/src/arrow/testing/gtest_util.h | 7 + cpp/src/arrow/testing/random.cc | 54 +++- cpp/src/arrow/testing/random.h | 25 +- cpp/src/arrow/testing/random_test.cc | 1 + cpp/src/arrow/type.cc | 34 ++- cpp/src/arrow/type.h | 124 ++++++++- cpp/src/arrow/type_fwd.h | 21 ++ cpp/src/arrow/type_test.cc | 12 + cpp/src/arrow/type_traits.cc | 2 + cpp/src/arrow/type_traits.h | 53 +++- cpp/src/arrow/util/assume_aligned.h | 45 ++++ cpp/src/arrow/util/binary_view_util.h | 105 ++++++++ cpp/src/arrow/util/string.cc | 42 +-- cpp/src/arrow/util/string.h | 4 +- cpp/src/arrow/visit_data_inline.h | 49 +++- cpp/src/arrow/visitor.cc | 8 +- cpp/src/arrow/visitor.h | 6 + cpp/src/arrow/visitor_generate.h | 2 + .../parquet/arrow/arrow_reader_writer_test.cc | 3 +- cpp/src/parquet/column_writer.cc | 2 + dev/archery/archery/integration/datagen.py | 5 +- .../src/arrow/python/arrow_to_pandas.cc | 3 +- .../src/arrow/python/python_to_arrow.cc | 12 +- r/src/r_to_arrow.cpp | 7 +- 69 files changed, 1962 insertions(+), 352 deletions(-) create mode 100644 cpp/src/arrow/util/assume_aligned.h create mode 100644 cpp/src/arrow/util/binary_view_util.h diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1cf..eab71de27b11a 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,8 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9d7..19a653df00f8c 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -24,6 +24,7 @@ #include "arrow/array/validate.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -89,6 +90,33 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(std::move(data)); +} + +BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector buffers, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + buffers.insert(buffers.begin(), std::move(views)); + buffers.insert(buffers.begin(), std::move(null_bitmap)); + SetData( + ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset)); +} + +std::string_view BinaryViewArray::GetView(int64_t i) const { + const std::shared_ptr* data_buffers = data_->buffers.data() + 2; + return util::FromIndexOffsetBinaryView(raw_values_[i], data_buffers); +} + +StringViewArray::StringViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(std::move(data)); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff841a..fd68a379ddbfb 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,65 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public FlatArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(std::shared_ptr data); + + BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector data_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + // For API compatibility with BinaryArray etc. + std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + const auto& values() const { return data_->buffers[1]; } + const c_type* raw_values() const { return raw_values_; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using FlatArray::FlatArray; + + void SetData(std::shared_ptr data) { + FlatArray::SetData(std::move(data)); + raw_values_ = data_->GetValuesSafe(1); + } + + const c_type* raw_values_; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(std::shared_ptr data); + + using BinaryViewArray::BinaryViewArray; + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a022a..e6b1ed339a27e 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -27,17 +27,21 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -365,38 +369,136 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +// Produce an Array of index/offset views from a std::vector of index/offset +// BinaryViewType::c_type +Result> MakeBinaryViewArray( + BufferVector data_buffers, const std::vector& views, + bool validate = true) { + auto length = static_cast(views.size()); + auto arr = std::make_shared( + utf8_view(), length, Buffer::FromVector(views), std::move(data_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +TEST(StringViewArray, Validate) { + // Since this is a test of validation, we need to be able to construct invalid arrays. + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // empty array is valid + EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok()); + + // empty array with some data buffers is valid + EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeBinaryViewArray({}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + util::ToInlineBinaryView("inline me"), + }), + Ok()); + + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT( + MakeBinaryViewArray({buffer_s, buffer_y}, + {util::ToIndexOffsetBinaryView( + "supe", static_cast(buffer_s->size()), 0, 0), + util::ToIndexOffsetBinaryView( + "yyyy", static_cast(buffer_y->size()), 1, 0)}), + Ok()); + + // views may not reference data buffers not present in the array + EXPECT_THAT( + MakeBinaryViewArray({}, {util::ToIndexOffsetBinaryView( + "supe", static_cast(buffer_s->size()), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... or ranges which overflow the referenced data buffer + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, {util::ToIndexOffsetBinaryView( + "supe", static_cast(buffer_s->size() + 50), 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the data buffer + EXPECT_THAT( + MakeBinaryViewArray({buffer_s, buffer_y}, + {util::ToIndexOffsetBinaryView( + "SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToIndexOffsetBinaryView( + "yyyy", static_cast(buffer_y->size()), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = + MakeBinaryViewArray({buffer_s}, + {util::ToIndexOffsetBinaryView( + "SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToIndexOffsetBinaryView("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT(MakeBinaryViewArray( + {buffer_s}, + { + util::ToIndexOffsetBinaryView( + "supe", static_cast(buffer_s->size()), 0, 0), + util::ToIndexOffsetBinaryView( + "uper", static_cast(buffer_s->size() - 1), 0, 1), + util::ToIndexOffsetBinaryView( + "perc", static_cast(buffer_s->size() - 2), 0, 2), + util::ToIndexOffsetBinaryView( + "erca", static_cast(buffer_s->size() - 3), 0, 3), + }), + Ok()); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return TypeClass::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); + } + + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(type(), values, &arr); + return ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK( + ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -880,14 +982,24 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { public: using TypeClass = T; - void SetUp() override { type_ = TypeTraits::type_singleton(); } + void SetUp() override { + if constexpr (is_binary_view_like_type::value) { + type_ = TypeClass::is_utf8 ? utf8_view() : binary_view(); + } else { + type_ = TypeTraits::type_singleton(); + } + } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +1016,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 2bef9d725d37f..46908439ef5f0 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -382,10 +382,12 @@ static std::vector> TestArrayUtilitiesAgainstTheseType float64(), binary(), large_binary(), + binary_view(), fixed_size_binary(3), decimal(16, 4), utf8(), large_utf8(), + utf8_view(), list(utf8()), list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull list(large_utf8()), @@ -601,12 +603,15 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + std::make_shared(Buffer::FromString("long string; not inlined")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -647,13 +652,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { for (int64_t length : {16}) { for (auto scalar : scalars) { + ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e4029..97110ea97f3fc 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 3000aea3e189a..76f0d9fc3f679 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -96,10 +96,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -117,7 +114,17 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_t::value || is_string_like_type::value, Status> + Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9c1..932277a4f374b 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,6 +40,60 @@ namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) {} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0; + for (int64_t i = 0; i < length; i++) { + if (!values[i].is_inline()) { + out_of_line_total += static_cast(values[i].size()); + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + for (int64_t i = 0; i < length; i++) { + if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) { + UnsafeAppendNull(); + continue; + } + + UnsafeAppend( + util::FromIndexOffsetBinaryView(values[i], array.GetVariadicBuffers().data())); + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary @@ -125,8 +179,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -173,10 +227,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81fd..1da9ed88fe3ba 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -36,6 +36,7 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -204,10 +205,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -463,6 +464,248 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment) + : pool_(pool), alignment_(alignment) {} + + void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; } + + using c_type = BinaryViewType::c_type; + + template + std::conditional_t, c_type> Append(const uint8_t* value, + int64_t length) { + if (length <= BinaryViewType::kInlineSize) { + return util::ToInlineBinaryView(value, static_cast(length)); + } + + if constexpr (Safe) { + ARROW_RETURN_NOT_OK(Reserve(length)); + } + + auto v = util::ToIndexOffsetBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), + current_offset_); + + memcpy(current_out_buffer_, value, static_cast(length)); + current_out_buffer_ += length; + current_remaining_bytes_ -= length; + current_offset_ += static_cast(length); + return v; + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 2GB"); + } + if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + blocks_.back()->size() - current_remaining_bytes_); + } + current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + int64_t alignment_; + int64_t blocksize_ = kDefaultBlocksize; + std::vector> blocks_; + + int32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + ARROW_ASSIGN_OR_RAISE(auto v, + data_heap_builder_.Append(value, length)); + data_builder_.UnsafeAppend(v); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 2GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppendToBitmap(true); + auto v = data_heap_builder_.Append(value, length); + data_builder_.UnsafeAppend(v); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder @@ -498,7 +741,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -549,7 +792,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 525b0afbc908a..7a96463ec3c43 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -194,6 +194,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..3f0d711dc5bb5 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -114,6 +120,7 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); class DictionaryMemoTableImpl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f7549fa9d1d1a..f8c5762910b90 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -230,6 +230,35 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + for (const auto& buf : util::span(in_data->buffers).subspan(2)) { + out_->buffers.push_back(buf); + } + } + + ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, BinaryViewType::kSize)); + ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, pool_)); + + auto* s = view_buffer->mutable_data_as(); + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (s[i].is_inline()) continue; + s[i].ref.buffer_index += static_cast(preceding_buffer_count); + } + } + + out_->buffers[1] = std::move(view_buffer); + return Status::OK(); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731ffe..55535f2c15433 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,8 +92,14 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } + ASSERT_OK(expected->ValidateFull()); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { @@ -155,6 +161,13 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 79595ab7c7c31..a59da54f0fe13 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -31,6 +31,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -92,6 +93,11 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +BufferSpan PackVariadicBuffers(util::span const> buffers) { + return {const_cast(reinterpret_cast(buffers.data())), + static_cast(buffers.size() * sizeof(std::shared_ptr))}; +} + } // namespace internal std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, @@ -187,7 +193,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -200,7 +206,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -215,6 +221,11 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of data buffers in the third buffer + this->buffers[2] = internal::PackVariadicBuffers(util::span(data.buffers).subspan(2)); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -247,6 +258,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -351,6 +364,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + + this->buffers[1].size = BinaryViewType::kSize; + this->buffers[1].data = scalar.scratch_space_; + static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); + auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; + if (scalar.is_valid) { + *view = util::ToIndexOffsetBinaryView(std::string_view{*scalar.value}, 0, 0); + this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); + } else { + *view = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); @@ -513,6 +539,14 @@ std::shared_ptr ArraySpan::ToArrayData() const { type_id = ext_type->storage_type()->id(); } + if (HasVariadicBuffers()) { + DCHECK_EQ(result->buffers.size(), 3); + result->buffers.pop_back(); + for (const auto& data_buffer : GetVariadicBuffers()) { + result->buffers.push_back(data_buffer); + } + } + if (type_id == Type::NA) { result->null_count = this->length; } else if (this->buffers[0].data == nullptr) { @@ -531,6 +565,16 @@ std::shared_ptr ArraySpan::ToArrayData() const { return result; } +util::span const> ArraySpan::GetVariadicBuffers() const { + DCHECK(HasVariadicBuffers()); + return {buffers[2].data_as>(), + buffers[2].size / sizeof(std::shared_ptr)}; +} + +bool ArraySpan::HasVariadicBuffers() const { + return type->id() == Type::BINARY_VIEW || type->id() == Type::STRING_VIEW; +} + std::shared_ptr ArraySpan::ToArray() const { return MakeArray(this->ToArrayData()); } @@ -722,7 +766,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -733,6 +778,18 @@ struct ViewDataImpl { DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 8c6b250b71adf..c54068986359d 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -28,6 +28,7 @@ #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" +#include "arrow/util/span.h" #include "arrow/util/visibility.h" namespace arrow { @@ -472,10 +473,12 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; } else { - this->null_count = this->length; + this->null_count = 0; } } @@ -530,6 +533,16 @@ struct ARROW_EXPORT ArraySpan { /// \see GetNullCount int64_t ComputeLogicalNullCount() const; + /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic + /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into + /// buffers[2]; IE buffers[2].data points to the first shared_ptr of the + /// variadic set and buffers[2].size is the number of variadic buffers times + /// sizeof(shared_ptr). + /// + /// \see HasVariadicBuffers + util::span const> GetVariadicBuffers() const; + bool HasVariadicBuffers() const; + private: ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& span, int64_t i); diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 3c1c8c453d1e7..63d8583b17324 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -150,6 +150,32 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + // Instead of defining a custom memo table for StringView we reuse BinaryType's, + // then convert to views when we copy data out of the memo table. + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + BinaryViewBuilder builder(pool); + RETURN_NOT_OK(builder.Resize(memo_table.size() - start_offset)); + RETURN_NOT_OK(builder.ReserveData(memo_table.values_size())); + memo_table.VisitValues(static_cast(start_offset), + [&](std::string_view s) { builder.UnsafeAppend(s); }); + + std::shared_ptr out; + RETURN_NOT_OK(builder.FinishInternal(&out)); + out->type = type; + return out; + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 98e9d51b5fc75..369cf1a652dae 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -271,6 +274,13 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + // TODO(GH-37879): This requires knowledge of whether the array is being swapped to + // native endian or from it so that we know what size to trust when deciding whether + // something is an inline view. + return Status::NotImplemented("Swapping endianness of ", type); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -379,6 +389,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(BinaryViewType::c_type) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -498,6 +512,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -600,6 +619,11 @@ class RepeatedArrayFactory { RepeatedArrayFactory(MemoryPool* pool, const Scalar& scalar, int64_t length) : pool_(pool), scalar_(scalar), length_(length) {} + template + const auto& scalar() const { + return checked_cast::ScalarType&>(scalar_); + } + Result> Create() { RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this)); return out_; @@ -621,7 +645,7 @@ class RepeatedArrayFactory { template enable_if_t::value || is_temporal_type::value, Status> Visit( const T&) { - auto value = checked_cast::ScalarType&>(scalar_).value; + auto value = scalar().value; return FinishFixedWidth(&value, sizeof(value)); } @@ -632,8 +656,7 @@ class RepeatedArrayFactory { template enable_if_decimal Visit(const T&) { - using ScalarType = typename TypeTraits::ScalarType; - auto value = checked_cast(scalar_).value.ToBytes(); + auto value = scalar().value.ToBytes(); return FinishFixedWidth(value.data(), value.size()); } @@ -644,29 +667,36 @@ class RepeatedArrayFactory { template enable_if_base_binary Visit(const T&) { - std::shared_ptr value = - checked_cast::ScalarType&>(scalar_).value; + const std::shared_ptr& value = scalar().value; std::shared_ptr values_buffer, offsets_buffer; RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( + length_, std::move(offsets_buffer), std::move(values_buffer)); + return Status::OK(); + } + + template + enable_if_binary_view_like Visit(const T& type) { + std::string_view value{*scalar().value}; + auto s = util::ToIndexOffsetBinaryView(value, 0, 0); + RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s))); + if (!s.is_inline()) { + out_->data()->buffers.push_back(scalar().value); + } return Status::OK(); } template enable_if_var_size_list Visit(const T& type) { - using ScalarType = typename TypeTraits::ScalarType; using ArrayType = typename TypeTraits::ArrayType; - auto value = checked_cast(scalar_).value; - - ArrayVector values(length_, value); + ArrayVector values(length_, scalar().value); ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_)); std::shared_ptr offsets_buffer; - auto size = static_cast(value->length()); + auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); out_ = diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 9f34af0525d96..a340caca03074 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -86,5 +86,18 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); +/// Convert between index/offset and raw pointer views. +/// +/// This function can be used to overwrite a buffer of views if desired, +/// IE it is supported for `in.buffers[1].data == out`. +/// +/// Note that calling this function is not necessary if all views happen to be +/// Inline; this is usually efficiently detectable by checking for an absence of any +/// data buffers. +/// +/// Will raise IndexError if a view refers to memory outside the data buffers. +ARROW_EXPORT +Status SwapBinaryViewPointers(const ArraySpan& in, BinaryViewType::c_type* out); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 19ff8e28b536c..142bab151744a 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,41 +31,43 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); + } } }; @@ -169,6 +171,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); @@ -248,6 +258,8 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -453,11 +465,16 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), - " buffers in array " - "of type ", - type.ToString(), ", got ", data.buffers.size()); + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); } // This check is required to avoid addition overflow below @@ -469,7 +486,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -595,6 +614,74 @@ struct ValidateArrayImpl { return Status::OK(); } + Status ValidateBinaryView(const BinaryViewType& type) { + int64_t views_byte_size = data.buffers[1]->size(); + int64_t required_view_count = data.length + data.offset; + if (static_cast(views_byte_size / BinaryViewType::kSize) < + required_view_count) { + return Status::Invalid("View buffer size (bytes): ", views_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (!full_validation) return Status::OK(); + + auto CheckPrefix = [&](size_t i, + std::array prefix, + const uint8_t* data) { + if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) { + return Status::OK(); + } + return Status::Invalid("View at slot ", i, " has inlined prefix 0x", + HexEncode(prefix.data(), BinaryViewType::kPrefixSize), + " but the out-of-line data begins with 0x", + HexEncode(data, BinaryViewType::kPrefixSize)); + }; + + util::span views(data.GetValues(1), + static_cast(data.length)); + util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2); + + for (size_t i = 0; i < static_cast(data.length); ++i) { + if (data.IsNull(i)) continue; + + if (views[i].size() < 0) { + return Status::Invalid("View at slot ", i, " has negative size ", + views[i].size()); + } + + if (views[i].is_inline()) continue; + + auto [size, prefix, buffer_index, offset] = views[i].ref; + + if (buffer_index < 0) { + return Status::Invalid("View at slot ", i, " has negative buffer index ", + buffer_index); + } + + if (offset < 0) { + return Status::Invalid("View at slot ", i, " has negative offset ", offset); + } + + if (static_cast(buffer_index) >= data_buffers.size()) { + return Status::IndexError("View at slot ", i, " references buffer ", buffer_index, + " but there are only ", data_buffers.size(), + " data buffers"); + } + const auto& buffer = data_buffers[buffer_index]; + + if (int64_t end = offset + static_cast(size); end > buffer->size()) { + return Status::IndexError( + "View at slot ", i, " references range ", offset, "-", end, " of buffer ", + buffer_index, " but that buffer is only ", buffer->size(), " bytes long"); + } + + RETURN_NOT_OK(CheckPrefix(i, prefix, buffer->data() + offset)); + } + + return Status::OK(); + } + template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; @@ -798,7 +885,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } @@ -806,5 +894,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index e7eea64043ba8..a84c98b6b2491 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -143,7 +143,10 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } - void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); } + + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); @@ -268,7 +271,7 @@ class TypedBufferBuilder< template void UnsafeAppend(Iter values_begin, Iter values_end) { - int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto num_elements = static_cast(std::distance(values_begin, values_end)); auto data = mutable_data() + length(); bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); std::copy(values_begin, values_end, data); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index caddbf9db5578..c7e6207bfefa4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -190,7 +192,7 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e06..409b21c90b6d1 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -38,6 +38,7 @@ #include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -261,6 +262,25 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!util::EqualIndexOffsetBinaryView(left_values[i], right_values[i], + left_buffers, right_buffers)) { + return false; + } + } + return true; + }); + return Status::OK(); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -632,6 +652,11 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewType&) { + result_ = true; + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& right = checked_cast(right_); @@ -802,8 +827,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index d9143b760f32b..5426dc405429c 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -31,6 +31,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -262,7 +263,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -503,39 +504,13 @@ class DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -544,22 +519,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -568,22 +543,24 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -593,31 +570,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 0df8425609ff1..d395261597696 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -727,6 +727,8 @@ struct ScalarToProtoImpl { s); } + Status Visit(const BinaryViewScalar& s) { return NotImplemented(s); } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 1f9141f36ba6b..d3fb058137e6a 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -263,6 +263,8 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const BinaryViewType& t) { return NotImplemented(t); } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index ed7be4b502985..1f19e891f425c 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -48,6 +48,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ -106,6 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +std::string_view GetStringView(const rj::Value& str) { + DCHECK(str.IsString()); + return {str.GetString(), str.GetStringLength()}; +} + class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -226,8 +232,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_var_length_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_var_length_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -382,6 +389,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -528,22 +537,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. - template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -642,6 +648,50 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteBinaryViewField(const ArrayType& array) { + writer_->Key("VIEWS"); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.is_inline()) { + writer_->Key("INLINED"); + if constexpr (ArrayType::TypeClass::is_utf8) { + writer_->String(reinterpret_cast(s.inline_data()), s.size()); + } else { + writer_->String(HexEncode(s.inline_data(), s.size())); + } + } else { + // Prefix is always 4 bytes so it may not be utf-8 even if the whole + // string view is + writer_->Key("PREFIX_HEX"); + writer_->String(HexEncode(s.inline_data(), BinaryViewType::kPrefixSize)); + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.ref.buffer_index); + writer_->Key("OFFSET"); + writer_->Int64(s.ref.offset); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_DATA_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) { + // Encode the data buffers into hexadecimal strings. + // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -682,8 +732,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -700,6 +752,17 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteBinaryViewField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1033,6 +1096,10 @@ Result> GetType(const RjObject& json_type, return utf8(); } else if (type_name == "binary") { return binary(); + } else if (type_name == "utf8view") { + return utf8_view(); + } else if (type_name == "binaryview") { + return binary_view(); } else if (type_name == "largeutf8") { return large_utf8(); } else if (type_name == "largebinary") { @@ -1246,10 +1313,12 @@ class ArrayReader { return Status::OK(); } - Result GetDataArray(const RjObject& obj) { - ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, kData)); + Result GetDataArray(const RjObject& obj, + const std::string& key = kData) { + ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, key)); if (static_cast(json_data_arr.Size()) != length_) { - return Status::Invalid("JSON DATA array size differs from advertised array length"); + return Status::Invalid("JSON ", key, " array size ", json_data_arr.Size(), + " differs from advertised array length ", length_); } return json_data_arr; } @@ -1332,6 +1401,97 @@ class ArrayReader { return FinishBuilder(&builder); } + template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS")); + ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + auto hex_string = GetStringView(json_buf); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE( + buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), pool_)); + + span views{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) { + if (!is_valid) { + s = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + DCHECK_GE(json_size->value.GetInt64(), 0); + auto size = static_cast(json_size->value.GetInt64()); + + if (size <= BinaryViewType::kInlineSize) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + s.inlined = {size, {}}; + + if constexpr (ViewType::is_utf8) { + DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize); + memcpy(&s.inlined.data, json_inlined->value.GetString(), size); + } else { + DCHECK_LE(json_inlined->value.GetStringLength(), + BinaryViewType::kInlineSize * 2); + RETURN_NOT_OK( + ParseHexValues(GetStringView(json_inlined->value), s.inlined.data.data())); + } + continue; + } + + auto json_prefix = json_view_obj.FindMember("PREFIX_HEX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX_HEX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + s.ref = { + size, + {}, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64()), + }; + + DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2); + RETURN_NOT_OK( + ParseHexValues(GetStringView(json_prefix->value), s.ref.prefix.data())); + + DCHECK_LE(static_cast(s.ref.buffer_index), buffers.size() - 2); + DCHECK_LE(static_cast(s.ref.offset) + s.size(), + buffers[s.ref.buffer_index + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8cbb..1ef076fac40e2 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb2635..0b6ae4f620647 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c9730283e..4d2d803f3f65e 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 6eee5955242aa..b67c26999945b 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -271,7 +271,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); template class TestStrings : public ::testing::Test { public: - std::shared_ptr type() { return TypeTraits::type_singleton(); } + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return T::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } + } }; TYPED_TEST_SUITE_P(TestStrings); @@ -327,6 +333,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5ce..9350fb4ac6096 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ -268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,6 +540,18 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + fb_type_ = flatbuf::Type::BinaryView; + type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const StringViewType& type) { + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return Status::OK(); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); @@ -967,6 +985,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -977,7 +996,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + auto fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts); + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_buffer_counts); return Status::OK(); } @@ -1224,11 +1246,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1285,11 +1308,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dace0..631a336f75a9a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 3ae007c20efe7..05a48aec2c7f3 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), + field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. @@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); @@ -2910,21 +2913,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. - ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6e801e1f8adb7..1bd14ac120cba 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -59,6 +59,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" #include "arrow/util/vector.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -248,6 +249,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicBufferCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -296,7 +306,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { DCHECK_NE(out_, nullptr); out_->buffers.resize(3); @@ -355,7 +364,22 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); + } + + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -450,6 +474,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -580,10 +605,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData( - filtered_columns[i], context.options.memory_pool)); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20fbc..6faaf96b332d4 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,39 +351,32 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + auto AppendColumn = [&](auto& MakeArray) { + arrays.emplace_back(); + RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c630..fc0c8ddbea319 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,7 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index e4b49ed56464e..9668f459d0d31 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -174,7 +176,8 @@ class RecordBatchSerializer { // Override this for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -296,6 +299,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_buffer_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -403,6 +408,18 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + auto views = SliceBuffer(array.values(), array.offset() * BinaryViewType::kSize, + array.length() * BinaryViewType::kSize); + out_->body_buffers.emplace_back(std::move(views)); + + out_->variadic_buffer_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); + } + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -590,6 +607,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -606,8 +624,8 @@ class DictionarySerializer : public RecordBatchSerializer { Status SerializeMetadata(int64_t num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3f2..4e0ee3dfc8b44 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_buffer_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ceec..c393b77acf334 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: diff --git a/cpp/src/arrow/json/converter_test.cc b/cpp/src/arrow/json/converter_test.cc index 378a4491c0e62..cfc44c99976d5 100644 --- a/cpp/src/arrow/json/converter_test.cc +++ b/cpp/src/arrow/json/converter_test.cc @@ -131,8 +131,8 @@ TEST(ConverterTest, Floats) { } } -TEST(ConverterTest, StringAndLargeString) { - for (auto string_type : {utf8(), large_utf8()}) { +TEST(ConverterTest, StringAndLargeStringAndStringView) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { ParseOptions options; options.explicit_schema = schema({field("", string_type)}); diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index e2941a29ab9bd..185dcde355f0a 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -104,6 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const DateType&) { return SetKind(Kind::kNumber); } Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const BinaryViewType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdbc9..f7ab6fd10275f 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index a5410df7e9ae2..bfda62881542e 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -302,6 +302,7 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index b2ad1ad519bb2..167e272705268 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,6 +263,12 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } + + Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template @@ -548,17 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 1d1ce4aa72948..5175b0128524c 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -263,24 +263,21 @@ struct ARROW_EXPORT BaseBinaryScalar return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} friend ArraySpan; + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -292,11 +289,39 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} + + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} + + std::string_view view() const override { return std::string_view(*this->value); } +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; + using TypeClass = StringViewType; + + explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} + + explicit StringViewScalar(std::string s) + : BinaryViewScalar(std::move(s), utf8_view()) {} + + StringViewScalar() : StringViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; @@ -307,7 +332,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -319,7 +345,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index bb462af86a5f2..641aae5a5e2e4 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,10 +176,17 @@ using DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b74c41f75e452..1386075397e20 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -363,13 +363,11 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr -static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, - int32_t min_length, int32_t max_length, - double null_probability, - int64_t alignment, - MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; +template +static std::shared_ptr GenerateBinaryArray( + RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, + double null_probability, std::optional max_data_buffer_length, + int64_t alignment, MemoryPool* memory_pool) { using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -387,7 +385,12 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; + if constexpr (std::is_base_of_v) { + if (max_data_buffer_length) { + builder.SetBlockSize(*max_data_buffer_length); + } + } for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -409,7 +412,8 @@ std::shared_ptr RandomArrayGenerator::String(int64_t size, int32_t min_le int64_t alignment, MemoryPool* memory_pool) { return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t min_length, @@ -417,8 +421,9 @@ std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t m double null_probability, int64_t alignment, MemoryPool* memory_pool) { - return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( @@ -430,6 +435,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView( + int64_t size, int32_t min_length, int32_t max_length, double null_probability, + std::optional max_data_buffer_length, int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, max_data_buffer_length, + alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { @@ -843,6 +857,24 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + std::optional max_data_buffer_length = + GetMetadata(field.metadata().get(), "max_data_buffer_length", 0); + if (*max_data_buffer_length == 0) { + *max_data_buffer_length = {}; + } + + return StringView(length, min_length, max_length, null_probability, + max_data_buffer_length, alignment) + ->View(field.type()) + .ValueOrDie(); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index de9ea6d05648d..1cf57cdb4e939 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,25 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] max_data_buffer_length the data buffer size at which + /// a new chunk will be generated + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] null_probability the probability of a value being null + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + std::optional max_data_buffer_length = {}, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate @@ -556,10 +575,14 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// - max_length (T::offset_type): the minimum length of the child to generate, /// default 1024 /// - /// For string and binary types T (not including their large variants): + /// For string and binary types T (not including their large or view variants): /// - unique (int32_t): if positive, this many distinct values will be generated /// and all array values will be one of these values, default -1 /// + /// For string and binary view types T: + /// - max_data_buffer_length (int64_t): the data buffer size at which a new chunk + /// will be generated, default 32KB + /// /// For MapType: /// - values (int32_t): the number of key-value pairs to generate, which will be /// partitioned among the array values. diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index f269818e83a3d..951b654e56f73 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a4f43256827da..5feeaf84d95e1 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -64,10 +64,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -130,6 +134,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, @@ -194,7 +200,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -247,8 +255,12 @@ struct PhysicalTypeVisitor { } template - Status Visit(const Type&) { - result = TypeTraits::type_singleton(); + Status Visit(const Type& type) { + if constexpr (std::is_base_of_v) { + result = binary_view(); + } else { + result = TypeTraits::type_singleton(); + } return Status::OK(); } }; @@ -1058,10 +1070,14 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { return "binary_view"; } + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { return "string_view"; } + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2821,8 +2837,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -3034,6 +3052,16 @@ TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + +const std::shared_ptr& binary_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } @@ -3294,7 +3322,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3f4dd5c9b21fa..a905192e4a54e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. + std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -772,6 +779,103 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr int kSize = 16; + static constexpr int kInlineSize = 12; + static constexpr int kPrefixSize = 4; + + /// Variable length string or binary with inline optimization for small values (12 bytes + /// or fewer). This is similar to std::string_view except limited in size to INT32_MAX + /// and at least the first four bytes of the string are copied inline (accessible + /// without pointer dereference). This inline prefix allows failing comparisons early. + /// Furthermore when dealing with short strings the CPU cache working set is reduced + /// since many can be inline. + /// + /// This union supports two states: + /// + /// - Entirely inlined string data + /// |----|--------------| + /// ^ ^ + /// | | + /// size in-line string data, zero padded + /// + /// - Reference into a buffer + /// |----|----|----|----| + /// ^ ^ ^ ^ + /// | | | | + /// size | | `------. + /// prefix | | + /// buffer index | + /// offset in buffer + /// + /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. + /// + /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf + /// + /// Alignment to 64 bits enables an aligned load of the size and prefix into + /// a single 64 bit integer, which is useful to the comparison fast path. + union alignas(int64_t) c_type { + struct { + int32_t size; + std::array data; + } inlined; + + struct { + int32_t size; + std::array prefix; + int32_t buffer_index; + int32_t offset; + } ref; + + /// The number of bytes viewed. + int32_t size() const { + // Size is in the common initial subsequence of each member of the union, + // so accessing `inlined.size` is legal even if another member is active. + return inlined.size; + } + + /// True if the view's data is entirely stored inline. + bool is_inline() const { return size() <= kInlineSize; } + + /// Return a pointer to the inline data of a view. + /// + /// For inline views, this points to the entire data of the view. + /// For other views, this points to the 4 byte prefix. + const uint8_t* inline_data() const& { + // Since `ref.prefix` has the same address as `inlined.data`, + // the branch will be trivially optimized out. + return is_inline() ? inlined.data.data() : ref.prefix.data(); + } + const uint8_t* inline_data() && = delete; + }; + static_assert(sizeof(c_type) == kSize); + static_assert(std::is_trivial_v); + + static constexpr const char* type_name() { return "binary_view"; } + + BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(kSize)}, + DataTypeLayout::VariableWidth()); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. + explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -818,6 +922,24 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 499684719feab..ca263b710317b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -110,6 +110,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -125,6 +130,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -415,6 +425,13 @@ struct Type { /// Run-end encoded data. RUN_END_ENCODED = 38, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW = 39, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW = 40, + // Leave this at the end MAX_ID }; @@ -456,10 +473,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 9ba8cf98dea4f..273f8933fa577 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1469,9 +1469,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index ac16afe4b8cd8..de328f322ad5f 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -88,6 +88,8 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::DURATION: case Type::INTERVAL_MONTH_DAY_NANO: // Stored as two 32-bit integers and a 64-bit // integer + case Type::STRING_VIEW: + case Type::BINARY_VIEW: return 8; case Type::DICTIONARY: case Type::EXTENSION: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index bcbde23ae4a4b..898e6465ef299 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +380,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +417,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,9 +637,28 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = - std::integral_constant::value && T::is_utf8>; + std::integral_constant::value && T::is_utf8) || + is_string_view_type::value>; template using enable_if_string_like = enable_if_t::value, R>; @@ -639,10 +681,9 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using is_binary_like_type = - std::integral_constant::value && - !is_string_like_type::value) || - is_fixed_size_binary_type::value>; +using is_binary_like_type = std::integral_constant< + bool, (is_base_binary_type::value && !is_string_like_type::value) || + is_binary_view_type::value || is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; @@ -801,8 +842,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || + std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/assume_aligned.h b/cpp/src/arrow/util/assume_aligned.h new file mode 100644 index 0000000000000..dfcbb0e6289e3 --- /dev/null +++ b/cpp/src/arrow/util/assume_aligned.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace arrow::util { + +template +[[nodiscard]] constexpr P* AssumeAligned(P* ptr) { +#if defined(__has_builtin) +#if __has_builtin(__builtin_assume_aligned) +#define ARROW_HAS_BUILTIN_ASSUME_ALIGNED +#endif +#endif + +#if defined(ARROW_HAS_BUILTIN_ASSUME_ALIGNED) +#undef ARROW_HAS_BUILTIN_ASSUME_ALIGNED + return static_cast(__builtin_assume_aligned(ptr, N)); +#else + return ptr; +#endif +} + +template +[[nodiscard]] constexpr P* AssumeAlignedAs(P* ptr) { + return AssumeAligned(ptr); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h new file mode 100644 index 0000000000000..e14da5d7e138c --- /dev/null +++ b/cpp/src/arrow/util/binary_view_util.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/type.h" +#include "arrow/util/assume_aligned.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) { + // Small string: inlined. Bytes beyond size are zeroed + BinaryViewType::c_type out; + out.inlined = {size, {}}; + memcpy(&out.inlined.data, data, size); + return out; +} + +inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { + return ToInlineBinaryView(v.data(), static_cast(v.size())); +} + +inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, int32_t size, + int32_t buffer_index, + int32_t offset) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + + // Large string: store index/offset. + BinaryViewType::c_type out; + out.ref = {size, {}, buffer_index, offset}; + memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); + return out; +} + +inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v, + int32_t buffer_index, + int32_t offset) { + return ToIndexOffsetBinaryView(v.data(), static_cast(v.size()), buffer_index, + offset); +} + +template +std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v, + const BufferPtr* data_buffers) { + auto* data = v.is_inline() ? v.inlined.data.data() + : data_buffers[v.ref.buffer_index]->data() + v.ref.offset; + return {reinterpret_cast(data), static_cast(v.size())}; +} +template +std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&, + const BufferPtr*) = delete; + +template +bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr*... buffers) { + int64_t l_size_and_prefix, r_size_and_prefix; + memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix)); + memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix)); + + if (l_size_and_prefix != r_size_and_prefix) return false; + + if (l.is_inline()) { + // The inline part is zeroed at construction, so we can compare + // a word at a time if data extends past 'prefix_'. + int64_t l_inlined, r_inlined; + memcpy(&l_inlined, + AssumeAlignedAs(l.inline_data() + BinaryViewType::kPrefixSize), + sizeof(l_size_and_prefix)); + memcpy(&r_inlined, + AssumeAlignedAs(r.inline_data() + BinaryViewType::kPrefixSize), + sizeof(r_size_and_prefix)); + return l_inlined == r_inlined; + } + + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. + auto [l_buffers, r_buffers] = std::pair{buffers...}; + const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset; + const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset; + return memcmp(l_data + BinaryViewType::kPrefixSize, + r_data + BinaryViewType::kPrefixSize, + l.size() - BinaryViewType::kPrefixSize) == 0; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea22..192173fa16ce9 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -25,15 +25,13 @@ namespace arrow { -static const char* kAsciiTable = "0123456789ABCDEF"; - std::string HexEncode(const uint8_t* data, size_t length) { - std::string hex_string; - hex_string.reserve(length * 2); - for (size_t j = 0; j < length; ++j) { + std::string hex_string(length * 2, '\0'); + for (size_t j = 0, i = 0; j < length; ++j) { // Convert to 2 base16 digits - hex_string.push_back(kAsciiTable[data[j] >> 4]); - hex_string.push_back(kAsciiTable[data[j] & 15]); + constexpr auto kHexDigitTable = "0123456789ABCDEF"; + hex_string[i++] = kHexDigitTable[data[j] >> 4]; + hex_string[i++] = kHexDigitTable[data[j] & 0b1111]; } return hex_string; } @@ -73,20 +71,34 @@ std::string HexEncode(std::string_view str) { return HexEncode(str.data(), str.s std::string Escape(std::string_view str) { return Escape(str.data(), str.size()); } -Status ParseHexValue(const char* data, uint8_t* out) { - char c1 = data[0]; - char c2 = data[1]; +constexpr uint8_t kInvalidHexDigit = -1; - const char* kAsciiTableEnd = kAsciiTable + 16; - const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c1); - const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c2); +constexpr uint8_t ParseHexDigit(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return kInvalidHexDigit; +} + +Status ParseHexValue(const char* data, uint8_t* out) { + uint8_t high = ParseHexDigit(data[0]); + uint8_t low = ParseHexDigit(data[1]); // Error checking - if (pos1 == kAsciiTableEnd || pos2 == kAsciiTableEnd || *pos1 != c1 || *pos2 != c2) { + if (high == kInvalidHexDigit || low == kInvalidHexDigit) { return Status::Invalid("Encountered non-hex digit"); } - *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); + *out = static_cast(high << 4 | low); + return Status::OK(); +} + +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } return Status::OK(); } diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a8c..d7e377773f62f 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); -ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 6a9b32d73a635..dcfb48dc477fd 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -23,6 +23,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" @@ -144,6 +145,44 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { + return valid_func(util::FromIndexOffsetBinaryView(s[index], data_buffers)); + }, + [&]() { return null_func(); }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return; + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { + valid_func(util::FromIndexOffsetBinaryView(s[index], data_buffers)); + }, + std::forward(null_func)); + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { @@ -240,9 +279,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { @@ -273,9 +311,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68d7..e057f6b12fb1b 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567e1..650b0e7ee0a30 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8fea..4b57abe53ff14 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 315405ef1e569..4e23d0fab5c69 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3374,6 +3374,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { for (const auto& inner_type : types) { if (inner_type->id() == ::arrow::Type::NA) continue; + if (inner_type->id() == ::arrow::Type::BINARY_VIEW) continue; + if (inner_type->id() == ::arrow::Type::STRING_VIEW) continue; auto writer_props = WriterProperties::Builder(); auto arrow_writer_props = ArrowWriterProperties::Builder(); @@ -3389,7 +3391,6 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { arrow_writer_props.coerce_timestamps(unit); } } - ASSERT_NO_FATAL_FAILURE(DoNestedRequiredRoundtrip(inner_type, writer_props.build(), arrow_writer_props.build())); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 72e984d7736fa..5dff533c1cce2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -134,6 +134,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Dictionary); NOT_IMPLEMENTED_VISIT(RunEndEncoded); NOT_IMPLEMENTED_VISIT(Extension); + NOT_IMPLEMENTED_VISIT(BinaryView); + NOT_IMPLEMENTED_VISIT(StringView); #undef NOT_IMPLEMENTED_VISIT diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 7635cfd98feda..1ce2775c160b8 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1856,12 +1856,13 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C++') .skip_tester('C#') .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust'), + .skip_tester('Rust') + .skip_format(SKIP_C_SCHEMA, 'C++') + .skip_format(SKIP_C_ARRAY, 'C++'), generate_extension_case() .skip_tester('C#') diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 91c7b8a45718e..8ed5d4e216e8e 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1353,7 +1353,8 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value, + std::is_base_of::value || + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 2143129356fd5..e47492499867a 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -572,12 +572,18 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value) || - std::is_same::value>> { + T, + enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value && !is_binary_view_like_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; +template +struct PyConverterTrait> { + // not implemented +}; + template struct PyConverterTrait> { using type = PyListConverter; diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 19c23973a62fe..d9bf848e24292 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1051,10 +1051,15 @@ struct RConverterTrait; template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && - !is_extension_type::value>> { + !is_extension_type::value && !is_binary_view_like_type::value>> { using type = RPrimitiveConverter; }; +template +struct RConverterTrait> { + // not implemented +}; + template struct RConverterTrait> { using type = RListConverter; From 0e14114b5bb8ada8088fd63ecdee811fd13efcf4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 10 Oct 2023 12:49:36 -0400 Subject: [PATCH 2/5] review comments --- cpp/src/arrow/array/array_binary.cc | 2 +- cpp/src/arrow/array/array_binary_test.cc | 68 ++++++++++------------ cpp/src/arrow/array/builder_binary.cc | 21 ++++--- cpp/src/arrow/array/builder_binary.h | 6 +- cpp/src/arrow/array/concatenate.cc | 7 ++- cpp/src/arrow/array/concatenate_test.cc | 4 +- cpp/src/arrow/array/data.cc | 6 +- cpp/src/arrow/array/data.h | 2 +- cpp/src/arrow/array/util.cc | 2 +- cpp/src/arrow/array/util.h | 13 ----- cpp/src/arrow/array/validate.cc | 13 ++++- cpp/src/arrow/compare.cc | 4 +- cpp/src/arrow/integration/json_internal.cc | 34 ++++++----- cpp/src/arrow/ipc/metadata_internal.cc | 5 +- cpp/src/arrow/ipc/reader.cc | 1 - cpp/src/arrow/testing/random.h | 3 +- cpp/src/arrow/type.cc | 6 +- cpp/src/arrow/type_traits.h | 6 +- cpp/src/arrow/util/assume_aligned.h | 45 -------------- cpp/src/arrow/util/binary_view_util.h | 36 +++++------- cpp/src/arrow/visit_data_inline.h | 4 +- 21 files changed, 116 insertions(+), 172 deletions(-) delete mode 100644 cpp/src/arrow/util/assume_aligned.h diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 19a653df00f8c..d83ba0ca8936d 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -107,7 +107,7 @@ BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, std::string_view BinaryViewArray::GetView(int64_t i) const { const std::shared_ptr* data_buffers = data_->buffers.data() + 2; - return util::FromIndexOffsetBinaryView(raw_values_[i], data_buffers); + return util::FromBinaryView(raw_values_[i], data_buffers); } StringViewArray::StringViewArray(std::shared_ptr data) { diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index e6b1ed339a27e..04391be0ac789 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -405,41 +405,39 @@ TEST(StringViewArray, Validate) { // non-inline views are expected to reference only buffers managed by the array EXPECT_THAT( - MakeBinaryViewArray({buffer_s, buffer_y}, - {util::ToIndexOffsetBinaryView( - "supe", static_cast(buffer_s->size()), 0, 0), - util::ToIndexOffsetBinaryView( - "yyyy", static_cast(buffer_y->size()), 1, 0)}), + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), Ok()); // views may not reference data buffers not present in the array EXPECT_THAT( - MakeBinaryViewArray({}, {util::ToIndexOffsetBinaryView( - "supe", static_cast(buffer_s->size()), 0, 0)}), + MakeBinaryViewArray( + {}, {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0)}), Raises(StatusCode::IndexError)); // ... or ranges which overflow the referenced data buffer EXPECT_THAT( MakeBinaryViewArray( - {buffer_s}, {util::ToIndexOffsetBinaryView( + {buffer_s}, {util::ToBinaryView( "supe", static_cast(buffer_s->size() + 50), 0, 0)}), Raises(StatusCode::IndexError)); // Additionally, the prefixes of non-inline views must match the data buffer EXPECT_THAT( - MakeBinaryViewArray({buffer_s, buffer_y}, - {util::ToIndexOffsetBinaryView( - "SUPE", static_cast(buffer_s->size()), 0, 0), - util::ToIndexOffsetBinaryView( - "yyyy", static_cast(buffer_y->size()), 1, 0)}), + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), Raises(StatusCode::Invalid)); // Invalid string views which are masked by a null bit do not cause validation to fail auto invalid_but_masked = - MakeBinaryViewArray({buffer_s}, - {util::ToIndexOffsetBinaryView( - "SUPE", static_cast(buffer_s->size()), 0, 0), - util::ToIndexOffsetBinaryView("yyyy", 50, 40, 30)}, - /*validate=*/false) + MakeBinaryViewArray( + {buffer_s}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", 50, 40, 30)}, + /*validate=*/false) .ValueOrDie() ->data(); invalid_but_masked->null_count = 2; @@ -447,19 +445,19 @@ TEST(StringViewArray, Validate) { EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); // overlapping views are allowed - EXPECT_THAT(MakeBinaryViewArray( - {buffer_s}, - { - util::ToIndexOffsetBinaryView( - "supe", static_cast(buffer_s->size()), 0, 0), - util::ToIndexOffsetBinaryView( - "uper", static_cast(buffer_s->size() - 1), 0, 1), - util::ToIndexOffsetBinaryView( - "perc", static_cast(buffer_s->size() - 2), 0, 2), - util::ToIndexOffsetBinaryView( - "erca", static_cast(buffer_s->size() - 3), 0, 3), - }), - Ok()); + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, + { + util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("uper", static_cast(buffer_s->size() - 1), 0, + 1), + util::ToBinaryView("perc", static_cast(buffer_s->size() - 2), 0, + 2), + util::ToBinaryView("erca", static_cast(buffer_s->size() - 3), 0, + 3), + }), + Ok()); } template @@ -982,13 +980,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { public: using TypeClass = T; - void SetUp() override { - if constexpr (is_binary_view_like_type::value) { - type_ = TypeClass::is_utf8 ? utf8_view() : binary_view(); - } else { - type_ = TypeTraits::type_singleton(); - } - } + void SetUp() override { type_ = TypeTraits::type_singleton(); } void TestBasics() { auto array = ArrayFromJSON( diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 932277a4f374b..3ff22d4a3feeb 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -35,6 +35,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/visit_data_inline.h" namespace arrow { @@ -51,12 +52,17 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse auto bitmap = array.GetValues(0, 0); auto values = array.GetValues(1) + offset; - int64_t out_of_line_total = 0; - for (int64_t i = 0; i < length; i++) { - if (!values[i].is_inline()) { - out_of_line_total += static_cast(values[i].size()); - } - } + int64_t out_of_line_total = 0, i = 0; + VisitNullBitmapInline( + array.buffers[0].data, array.offset, array.length, array.null_count, + [&] { + if (!values[i].is_inline()) { + out_of_line_total += static_cast(values[i].size()); + } + ++i; + }, + [&] { ++i; }); + RETURN_NOT_OK(Reserve(length)); RETURN_NOT_OK(ReserveData(out_of_line_total)); @@ -66,8 +72,7 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse continue; } - UnsafeAppend( - util::FromIndexOffsetBinaryView(values[i], array.GetVariadicBuffers().data())); + UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data())); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 1da9ed88fe3ba..f5b335b87a26d 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -500,9 +500,9 @@ class ARROW_EXPORT StringHeapBuilder { ARROW_RETURN_NOT_OK(Reserve(length)); } - auto v = util::ToIndexOffsetBinaryView(value, static_cast(length), - static_cast(blocks_.size() - 1), - current_offset_); + auto v = + util::ToBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), current_offset_); memcpy(current_out_buffer_, value, static_cast(length)); current_out_buffer_ += length; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f8c5762910b90..682e4945a027f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -242,7 +242,7 @@ class ConcatenateImpl { ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, BinaryViewType::kSize)); ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, pool_)); - auto* s = view_buffer->mutable_data_as(); + auto* views = view_buffer->mutable_data_as(); size_t preceding_buffer_count = 0; int64_t i = in_[0]->length; @@ -250,8 +250,9 @@ class ConcatenateImpl { preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { - if (s[i].is_inline()) continue; - s[i].ref.buffer_index += static_cast(preceding_buffer_count); + if (views[i].is_inline()) continue; + views[i].ref.buffer_index = SafeSignedAdd( + views[i].ref.buffer_index, static_cast(preceding_buffer_count)); } } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 55535f2c15433..e49f65e15e15d 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -99,8 +99,8 @@ class ConcatenateTest : public ::testing::Test { for (auto slice : slices) { ASSERT_OK(slice->ValidateFull()); } - ASSERT_OK(expected->ValidateFull()); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); @@ -163,7 +163,7 @@ TEST_F(ConcatenateTest, StringType) { TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability); ASSERT_OK((**out).ValidateFull()); }); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index a59da54f0fe13..678513fd4d151 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -93,7 +93,7 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } -BufferSpan PackVariadicBuffers(util::span const> buffers) { +BufferSpan PackVariadicBuffers(util::span> buffers) { return {const_cast(reinterpret_cast(buffers.data())), static_cast(buffers.size() * sizeof(std::shared_ptr))}; } @@ -372,7 +372,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; if (scalar.is_valid) { - *view = util::ToIndexOffsetBinaryView(std::string_view{*scalar.value}, 0, 0); + *view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0); this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); } else { *view = {}; @@ -565,7 +565,7 @@ std::shared_ptr ArraySpan::ToArrayData() const { return result; } -util::span const> ArraySpan::GetVariadicBuffers() const { +util::span> ArraySpan::GetVariadicBuffers() const { DCHECK(HasVariadicBuffers()); return {buffers[2].data_as>(), buffers[2].size / sizeof(std::shared_ptr)}; diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index c54068986359d..40a77640cd1e5 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -540,7 +540,7 @@ struct ARROW_EXPORT ArraySpan { /// sizeof(shared_ptr). /// /// \see HasVariadicBuffers - util::span const> GetVariadicBuffers() const; + util::span> GetVariadicBuffers() const; bool HasVariadicBuffers() const; private: diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 369cf1a652dae..9ea2fc2b6f0a1 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -680,7 +680,7 @@ class RepeatedArrayFactory { template enable_if_binary_view_like Visit(const T& type) { std::string_view value{*scalar().value}; - auto s = util::ToIndexOffsetBinaryView(value, 0, 0); + auto s = util::ToBinaryView(value, 0, 0); RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s))); if (!s.is_inline()) { out_->data()->buffers.push_back(scalar().value); diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index a340caca03074..9f34af0525d96 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -86,18 +86,5 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); -/// Convert between index/offset and raw pointer views. -/// -/// This function can be used to overwrite a buffer of views if desired, -/// IE it is supported for `in.buffers[1].data == out`. -/// -/// Note that calling this function is not necessary if all views happen to be -/// Inline; this is usually efficiently detectable by checking for an absence of any -/// data buffers. -/// -/// Will raise IndexError if a view refers to memory outside the data buffers. -ARROW_EXPORT -Status SwapBinaryViewPointers(const ArraySpan& in, BinaryViewType::c_type* out); - } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 142bab151744a..3dde41b1450e8 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -650,7 +650,18 @@ struct ValidateArrayImpl { views[i].size()); } - if (views[i].is_inline()) continue; + if (views[i].is_inline()) { + auto padding_bytes = util::span(views[i].inlined.data).subspan(views[i].size()); + for (auto padding_byte : padding_bytes) { + if (padding_byte != 0) { + return Status::Invalid("View at slot ", i, " was inline with size ", + views[i].size(), + " but its padding bytes were not all zero: ", + HexEncode(padding_bytes.data(), padding_bytes.size())); + } + } + continue; + } auto [size, prefix, buffer_index, offset] = views[i].ref; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 409b21c90b6d1..50cfdd05a14bb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -271,8 +271,8 @@ class RangeDataEqualsImpl { auto* right_buffers = right_.buffers.data() + 2; VisitValidRuns([&](int64_t i, int64_t length) { for (auto end_i = i + length; i < end_i; ++i) { - if (!util::EqualIndexOffsetBinaryView(left_values[i], right_values[i], - left_buffers, right_buffers)) { + if (!util::EqualBinaryView(left_values[i], right_values[i], left_buffers, + right_buffers)) { return false; } } diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 1f19e891f425c..c4f44d21bed38 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -107,9 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } -std::string_view GetStringView(const rj::Value& str) { - DCHECK(str.IsString()); - return {str.GetString(), str.GetStringLength()}; +Result GetStringView(const rj::Value& str) { + if (!str.IsString()) { + return Status::Invalid("field was not a string line ", __LINE__); + } + return std::string_view{str.GetString(), str.GetStringLength()}; } class SchemaWriter { @@ -1413,7 +1415,7 @@ class ArrayReader { BufferVector buffers; buffers.resize(json_variadic_bufs.Size() + 2); for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { - auto hex_string = GetStringView(json_buf); + ARROW_ASSIGN_OR_RAISE(auto hex_string, GetStringView(json_buf)); ARROW_ASSIGN_OR_RAISE( buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); @@ -1433,9 +1435,9 @@ class ArrayReader { static_cast(length_)}; int64_t null_count = 0; - for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) { + for (auto [json_view, out_view, is_valid] : Zip(json_views, views, is_valid_)) { if (!is_valid) { - s = {}; + out_view = {}; ++null_count; continue; } @@ -1451,16 +1453,16 @@ class ArrayReader { if (size <= BinaryViewType::kInlineSize) { auto json_inlined = json_view_obj.FindMember("INLINED"); RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); - s.inlined = {size, {}}; + out_view.inlined = {size, {}}; if constexpr (ViewType::is_utf8) { DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize); - memcpy(&s.inlined.data, json_inlined->value.GetString(), size); + memcpy(&out_view.inlined.data, json_inlined->value.GetString(), size); } else { DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize * 2); - RETURN_NOT_OK( - ParseHexValues(GetStringView(json_inlined->value), s.inlined.data.data())); + ARROW_ASSIGN_OR_RAISE(auto inlined, GetStringView(json_inlined->value)); + RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data())); } continue; } @@ -1472,7 +1474,7 @@ class ArrayReader { RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); - s.ref = { + out_view.ref = { size, {}, static_cast(json_buffer_index->value.GetInt64()), @@ -1480,12 +1482,12 @@ class ArrayReader { }; DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2); - RETURN_NOT_OK( - ParseHexValues(GetStringView(json_prefix->value), s.ref.prefix.data())); + ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value)); + RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data())); - DCHECK_LE(static_cast(s.ref.buffer_index), buffers.size() - 2); - DCHECK_LE(static_cast(s.ref.offset) + s.size(), - buffers[s.ref.buffer_index + 2]->size()); + DCHECK_LE(static_cast(out_view.ref.buffer_index), buffers.size() - 2); + DCHECK_LE(static_cast(out_view.ref.offset) + out_view.size(), + buffers[out_view.ref.buffer_index + 2]->size()); } data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 9350fb4ac6096..ab1a58dd1df8b 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -996,7 +996,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - auto fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts); + flatbuffers::Offset> fb_variadic_buffer_counts{}; + if (!variadic_buffer_counts.empty()) { + fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts); + } *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, fb_variadic_buffer_counts); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 1bd14ac120cba..d603062d81d4a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -59,7 +59,6 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" #include "arrow/util/vector.h" -#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" #include "generated/File_generated.h" // IWYU pragma: export diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1cf57cdb4e939..cbdac3baa0109 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -374,10 +374,11 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// determined by the uniform distribution /// \param[in] max_length the upper bound of the string length /// determined by the uniform distribution + /// \param[in] null_probability the probability of a value being null /// \param[in] max_data_buffer_length the data buffer size at which /// a new chunk will be generated /// \param[in] alignment alignment for memory allocations (in bytes) - /// \param[in] null_probability the probability of a value being null + /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5feeaf84d95e1..f378bd974047d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -256,11 +256,7 @@ struct PhysicalTypeVisitor { template Status Visit(const Type& type) { - if constexpr (std::is_base_of_v) { - result = binary_view(); - } else { - result = TypeTraits::type_singleton(); - } + result = TypeTraits::type_singleton(); return Status::OK(); } }; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 898e6465ef299..738aee08a3b65 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -347,7 +347,8 @@ struct TypeTraits { using BuilderType = BinaryViewBuilder; using ScalarType = BinaryViewScalar; using CType = BinaryViewType::c_type; - constexpr static bool is_parameter_free = false; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary_view(); } }; template <> @@ -386,7 +387,8 @@ struct TypeTraits { using BuilderType = StringViewBuilder; using ScalarType = StringViewScalar; using CType = BinaryViewType::c_type; - constexpr static bool is_parameter_free = false; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8_view(); } }; template <> diff --git a/cpp/src/arrow/util/assume_aligned.h b/cpp/src/arrow/util/assume_aligned.h deleted file mode 100644 index dfcbb0e6289e3..0000000000000 --- a/cpp/src/arrow/util/assume_aligned.h +++ /dev/null @@ -1,45 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -namespace arrow::util { - -template -[[nodiscard]] constexpr P* AssumeAligned(P* ptr) { -#if defined(__has_builtin) -#if __has_builtin(__builtin_assume_aligned) -#define ARROW_HAS_BUILTIN_ASSUME_ALIGNED -#endif -#endif - -#if defined(ARROW_HAS_BUILTIN_ASSUME_ALIGNED) -#undef ARROW_HAS_BUILTIN_ASSUME_ALIGNED - return static_cast(__builtin_assume_aligned(ptr, N)); -#else - return ptr; -#endif -} - -template -[[nodiscard]] constexpr P* AssumeAlignedAs(P* ptr) { - return AssumeAligned(ptr); -} - -} // namespace arrow::util diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h index e14da5d7e138c..13f05e0e7c808 100644 --- a/cpp/src/arrow/util/binary_view_util.h +++ b/cpp/src/arrow/util/binary_view_util.h @@ -21,7 +21,6 @@ #include #include "arrow/type.h" -#include "arrow/util/assume_aligned.h" #include "arrow/util/span.h" namespace arrow::util { @@ -38,9 +37,8 @@ inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { return ToInlineBinaryView(v.data(), static_cast(v.size())); } -inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, int32_t size, - int32_t buffer_index, - int32_t offset) { +inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, + int32_t buffer_index, int32_t offset) { if (size <= BinaryViewType::kInlineSize) { return ToInlineBinaryView(data, size); } @@ -52,27 +50,24 @@ inline BinaryViewType::c_type ToIndexOffsetBinaryView(const void* data, int32_t return out; } -inline BinaryViewType::c_type ToIndexOffsetBinaryView(std::string_view v, - int32_t buffer_index, - int32_t offset) { - return ToIndexOffsetBinaryView(v.data(), static_cast(v.size()), buffer_index, - offset); +inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index, + int32_t offset) { + return ToBinaryView(v.data(), static_cast(v.size()), buffer_index, offset); } template -std::string_view FromIndexOffsetBinaryView(const BinaryViewType::c_type& v, - const BufferPtr* data_buffers) { +std::string_view FromBinaryView(const BinaryViewType::c_type& v, + const BufferPtr* data_buffers) { auto* data = v.is_inline() ? v.inlined.data.data() : data_buffers[v.ref.buffer_index]->data() + v.ref.offset; return {reinterpret_cast(data), static_cast(v.size())}; } template -std::string_view FromIndexOffsetBinaryView(BinaryViewType::c_type&&, - const BufferPtr*) = delete; +std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = delete; -template -bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, - const BufferPtr*... buffers) { +template +bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr* l_buffers, const BufferPtr* r_buffers) { int64_t l_size_and_prefix, r_size_and_prefix; memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix)); memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix)); @@ -83,18 +78,13 @@ bool EqualIndexOffsetBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type // The inline part is zeroed at construction, so we can compare // a word at a time if data extends past 'prefix_'. int64_t l_inlined, r_inlined; - memcpy(&l_inlined, - AssumeAlignedAs(l.inline_data() + BinaryViewType::kPrefixSize), - sizeof(l_size_and_prefix)); - memcpy(&r_inlined, - AssumeAlignedAs(r.inline_data() + BinaryViewType::kPrefixSize), - sizeof(r_size_and_prefix)); + memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined)); + memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined)); return l_inlined == r_inlined; } // Sizes are equal and this is not inline, therefore both are out // of line and have kPrefixSize first in common. - auto [l_buffers, r_buffers] = std::pair{buffers...}; const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset; const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset; return memcmp(l_data + BinaryViewType::kPrefixSize, diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index dcfb48dc477fd..874cf5b514452 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -161,7 +161,7 @@ struct ArraySpanInlineVisitor> { return VisitBitBlocks( arr.buffers[0].data, arr.offset, arr.length, [&](int64_t index) { - return valid_func(util::FromIndexOffsetBinaryView(s[index], data_buffers)); + return valid_func(util::FromBinaryView(s[index], data_buffers)); }, [&]() { return null_func(); }); } @@ -177,7 +177,7 @@ struct ArraySpanInlineVisitor> { VisitBitBlocksVoid( arr.buffers[0].data, arr.offset, arr.length, [&](int64_t index) { - valid_func(util::FromIndexOffsetBinaryView(s[index], data_buffers)); + valid_func(util::FromBinaryView(s[index], data_buffers)); }, std::forward(null_func)); } From ce925f684771e3bd89c7ee2194886efd6983520a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 10 Oct 2023 13:06:43 -0400 Subject: [PATCH 3/5] concatenate should ensure null slots contain an empty string view --- cpp/src/arrow/array/concatenate.cc | 11 +++++++++++ cpp/src/arrow/visit_data_inline.h | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 682e4945a027f..7110c8a5b649e 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -43,6 +43,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -256,6 +257,16 @@ class ConcatenateImpl { } } + if (out_->buffers[0] != nullptr) { + i = in_[0]->length; + VisitNullBitmapInline( + out_->buffers[0]->data(), in_[0]->length, out_->length, out_->null_count, + [&] { ++i; }, + [&] { + views[i++] = {}; // overwrite views under null bits with an empty view + }); + } + out_->buffers[1] = std::move(view_buffer); return Status::OK(); } diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 874cf5b514452..a2ba9cfc65071 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -176,9 +176,7 @@ struct ArraySpanInlineVisitor> { auto* data_buffers = arr.GetVariadicBuffers().data(); VisitBitBlocksVoid( arr.buffers[0].data, arr.offset, arr.length, - [&](int64_t index) { - valid_func(util::FromBinaryView(s[index], data_buffers)); - }, + [&](int64_t index) { valid_func(util::FromBinaryView(s[index], data_buffers)); }, std::forward(null_func)); } }; From 02d3385e76674ca4cd3d4cf24455eed0ef0d81ae Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 11 Oct 2023 10:28:20 -0400 Subject: [PATCH 4/5] correct value count --- cpp/src/arrow/array/concatenate.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 7110c8a5b649e..37c7271b5b95c 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -260,8 +260,7 @@ class ConcatenateImpl { if (out_->buffers[0] != nullptr) { i = in_[0]->length; VisitNullBitmapInline( - out_->buffers[0]->data(), in_[0]->length, out_->length, out_->null_count, - [&] { ++i; }, + out_->buffers[0]->data(), i, out_->length - i, out_->null_count, [&] { ++i; }, [&] { views[i++] = {}; // overwrite views under null bits with an empty view }); From 028bca223255432134f52ebc5222fd00eeaf419b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 25 Oct 2023 17:15:05 -0400 Subject: [PATCH 5/5] revert binary_like, review comments --- cpp/src/arrow/array/builder_base.cc | 3 +-- cpp/src/arrow/array/builder_binary.h | 8 ++++++ cpp/src/arrow/array/concatenate_test.cc | 3 ++- cpp/src/arrow/array/diff.cc | 31 +++++++++++----------- cpp/src/arrow/array/diff_test.cc | 13 +++++++++ cpp/src/arrow/integration/json_internal.cc | 7 ++--- cpp/src/arrow/pretty_print.cc | 17 +++++------- cpp/src/arrow/type_traits.h | 10 +++---- cpp/src/arrow/util/binary_view_util.h | 4 +-- 9 files changed, 55 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 76f0d9fc3f679..d3502a0ab645a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -123,8 +123,7 @@ struct AppendScalarImpl { Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } template - enable_if_t::value || is_string_like_type::value, Status> - Visit(const T&) { + enable_if_has_string_view Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index f5b335b87a26d..3e87cf2403610 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -583,8 +583,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { data_builder_(pool, alignment), data_heap_builder_(pool, alignment) {} + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. int64_t current_block_bytes_remaining() const { return data_heap_builder_.current_remaining_bytes(); } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index e49f65e15e15d..0ef1136ea78f8 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -163,7 +163,8 @@ TEST_F(ConcatenateTest, StringType) { TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability); + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); ASSERT_OK((**out).ValidateFull()); }); } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 800f19b752726..f267c52c0d696 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -476,30 +476,31 @@ class MakeFormatterImpl { return Status::OK(); } - // format Binary, LargeBinary and FixedSizeBinary in hexadecimal template - enable_if_binary_like Visit(const T&) { + enable_if_has_string_view Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << HexEncode(checked_cast(array).GetView(index)); + std::string_view view = checked_cast(array).GetView(index); + if constexpr (T::is_utf8) { + // format String and StringView with \"\n\r\t\\ escaped + *os << '"' << Escape(view) << '"'; + } else { + // format Binary, LargeBinary, BinaryView, and FixedSizeBinary in hexadecimal + *os << HexEncode(view); + } }; return Status::OK(); } - // format Strings with \"\n\r\t\\ escaped + // format Decimals with Decimal___Array::FormatValue template - enable_if_string_like Visit(const T&) { - using ArrayType = typename TypeTraits::ArrayType; - impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << "\"" << Escape(checked_cast(array).GetView(index)) << "\""; - }; - return Status::OK(); - } - - // format Decimals with Decimal128Array::FormatValue - Status Visit(const Decimal128Type&) { + enable_if_decimal Visit(const T&) { impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << checked_cast(array).FormatValue(index); + if constexpr (T::type_id == Type::DECIMAL128) { + *os << checked_cast(array).FormatValue(index); + } else { + *os << checked_cast(array).FormatValue(index); + } }; return Status::OK(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index e322006732f9a..4c2f39eddf863 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -596,6 +596,19 @@ TEST_F(DiffTest, UnifiedDiffFormatter) { @@ -5, +8 @@ +79 +11 +)"); + } + + for (const auto& type : { + decimal128(10, 4), + decimal256(10, 4), + }) { + base_ = ArrayFromJSON(type, R"(["123.4567", "-78.9000"])"); + target_ = ArrayFromJSON(type, R"(["123.4567", "-123.4567"])"); + AssertDiffAndFormat(R"( +@@ -1, +1 @@ +--78.9000 ++-123.4567 )"); } } diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index c4f44d21bed38..59749c36a958e 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -109,7 +109,7 @@ std::string GetTimeUnitName(TimeUnit::type unit) { Result GetStringView(const rj::Value& str) { if (!str.IsString()) { - return Status::Invalid("field was not a string line ", __LINE__); + return Status::Invalid("field was not a string"); } return std::string_view{str.GetString(), str.GetStringLength()}; } @@ -1364,10 +1364,7 @@ class ArrayReader { RETURN_NOT_OK(builder.AppendNull()); continue; } - - DCHECK(json_val.IsString()); - std::string_view val{ - json_val.GetString()}; // XXX can we use json_val.GetStringLength()? + ARROW_ASSIGN_OR_RAISE(auto val, GetStringView(json_val)); int64_t offset_start = ParseOffset(json_offsets[i]); int64_t offset_end = ParseOffset(json_offsets[i + 1]); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index bfda62881542e..b392e027a6b89 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -229,18 +229,13 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_string_like WriteDataValues(const ArrayType& array) { + enable_if_has_string_view WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { - (*sink_) << "\"" << array.GetView(i) << "\""; - return Status::OK(); - }); - } - - template - enable_if_t::value && !is_decimal_type::value, Status> - WriteDataValues(const ArrayType& array) { - return WriteValues(array, [&](int64_t i) { - (*sink_) << HexEncode(array.GetView(i)); + if constexpr (T::is_utf8) { + (*sink_) << "\"" << array.GetView(i) << "\""; + } else { + (*sink_) << HexEncode(array.GetView(i)); + } return Status::OK(); }); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 738aee08a3b65..9d8cafacf397b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -659,8 +659,7 @@ using enable_if_string_view = enable_if_t::value, R>; template using is_string_like_type = - std::integral_constant::value && T::is_utf8) || - is_string_view_type::value>; + std::integral_constant::value && T::is_utf8>; template using enable_if_string_like = enable_if_t::value, R>; @@ -683,9 +682,10 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using is_binary_like_type = std::integral_constant< - bool, (is_base_binary_type::value && !is_string_like_type::value) || - is_binary_view_type::value || is_fixed_size_binary_type::value>; +using is_binary_like_type = + std::integral_constant::value && + !is_string_like_type::value) || + is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h index 13f05e0e7c808..94f7a5bdfa667 100644 --- a/cpp/src/arrow/util/binary_view_util.h +++ b/cpp/src/arrow/util/binary_view_util.h @@ -75,8 +75,8 @@ bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, if (l_size_and_prefix != r_size_and_prefix) return false; if (l.is_inline()) { - // The inline part is zeroed at construction, so we can compare - // a word at a time if data extends past 'prefix_'. + // The columnar spec mandates that the inlined part be zero-padded, so we can compare + // a word at a time regardless of the exact size. int64_t l_inlined, r_inlined; memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined)); memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined));