diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 7ff3261ecba5e..c53da8591e94e 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
   }
 }
 
+TEST_F(TestBinaryBuilder, TestCapacityReserve) {
+  vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"};
+  int N = static_cast<int>(strings.size());
+  int reps = 15;
+  int64_t length = 0;
+  int64_t capacity = 1000;
+  int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+
+  ASSERT_OK(builder_->ReserveData(capacity));
+
+  ASSERT_EQ(length, builder_->value_data_length());
+  ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+  for (int j = 0; j < reps; ++j) {
+    for (int i = 0; i < N; ++i) {
+      ASSERT_OK(builder_->Append(strings[i]));
+      length += static_cast<int>(strings[i].size());
+
+      ASSERT_EQ(length, builder_->value_data_length());
+      ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+    }
+  }
+
+  int extra_capacity = 500;
+  expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);
+
+  ASSERT_OK(builder_->ReserveData(extra_capacity));
+
+  ASSERT_EQ(length, builder_->value_data_length());
+  ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+
+  Done();
+
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(0, result_->null_count());
+  ASSERT_EQ(reps * 40, result_->value_data()->size());
+  ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
+}
+
 TEST_F(TestBinaryBuilder, TestZeroLength) {
   // All buffers are null
   Done();
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index b50b1a1aa041d..44c352a93f273 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
 
   const T* data() const { return reinterpret_cast<const T*>(data_); }
   int64_t length() const { return size_ / sizeof(T); }
+  int64_t capacity() const { return capacity_ / sizeof(T); }
 };
 
 /// \brief Allocate a fixed size mutable buffer from a memory pool
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index de132b5f6a0d1..db901526fc2ee 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) {
   DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
-  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+  return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status ListBuilder::Resize(int64_t capacity) {
   DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
   // one more then requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
 }
 
@@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) {
   DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
-  return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
+  return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status BinaryBuilder::Resize(int64_t capacity) {
   DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
   // one more then requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
 }
 
+Status BinaryBuilder::ReserveData(int64_t elements) {
+  if (value_data_length() + elements > value_data_capacity()) {
+    if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+      return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
+    }
+    RETURN_NOT_OK(value_data_builder_.Reserve(elements));
+  }
+  return Status::OK();
+}
+
 Status BinaryBuilder::AppendNextOffset() {
   const int64_t num_bytes = value_data_builder_.length();
   if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index ce7b8cd197da3..d1611f60cd924 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
 
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
+  /// \brief Ensures there is enough allocated capacity to append the indicated
+  /// number of bytes to the value data buffer without additional allocations
+  Status ReserveData(int64_t elements);
   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
   /// \return size of values buffer so far
   int64_t value_data_length() const { return value_data_builder_.length(); }
+  /// \return capacity of values buffer
+  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
 
   /// Temporary access to a value.
   ///