-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-38432: [C++][Parquet] Trying to use original logic of BinaryBuilder #38437
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1205,16 +1205,22 @@ struct ArrowBinaryHelper<ByteArrayType> { | |
return Status::OK(); | ||
} | ||
|
||
Status PrepareNextInput(int64_t next_value_length, | ||
std::optional<int64_t> estimated_remaining_data_length = {}) { | ||
Status PrepareNextInput(int64_t next_value_length) { | ||
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { | ||
// This element would exceed the capacity of a chunk | ||
return PushChunk(); | ||
} | ||
return Status::OK(); | ||
} | ||
|
||
Status PrepareNextInputWithEstimatedLength(int64_t next_value_length, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Split into two functions to make it fast and clear |
||
int64_t estimated_remaining_data_length) { | ||
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { | ||
// This element would exceed the capacity of a chunk | ||
RETURN_NOT_OK(PushChunk()); | ||
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); | ||
if (estimated_remaining_data_length.has_value()) { | ||
RETURN_NOT_OK(acc_->builder->ReserveData( | ||
std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_))); | ||
} | ||
RETURN_NOT_OK(acc_->builder->ReserveData( | ||
std::min<int64_t>(estimated_remaining_data_length, chunk_space_remaining_))); | ||
} | ||
return Status::OK(); | ||
} | ||
|
@@ -1271,8 +1277,10 @@ struct ArrowBinaryHelper<FLBAType> { | |
return acc_->Reserve(entries_remaining_); | ||
} | ||
|
||
Status PrepareNextInput(int64_t next_value_length, | ||
std::optional<int64_t> estimated_remaining_data_length = {}) { | ||
Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } | ||
|
||
Status PrepareNextInputWithEstimatedLength(int64_t next_value_length, | ||
int64_t estimated_remaining_data_length) { | ||
return Status::OK(); | ||
} | ||
|
||
|
@@ -1421,7 +1429,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>, | |
if (ARROW_PREDICT_FALSE(len_ < increment)) { | ||
ParquetException::EofException(); | ||
} | ||
RETURN_NOT_OK(helper.PrepareNextInput(value_len, len_)); | ||
RETURN_NOT_OK(helper.PrepareNextInputWithEstimatedLength(value_len, len_)); | ||
helper.UnsafeAppend(data_ + 4, value_len); | ||
data_ += increment; | ||
len_ -= increment; | ||
|
@@ -1915,7 +1923,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>, | |
int32_t indices[kBufferSize]; | ||
|
||
ArrowBinaryHelper<ByteArrayType> helper(out, num_values); | ||
RETURN_NOT_OK(helper.Prepare()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Though I think this Prepare is ok, original code doesn't have this Prepare, perhaps it would make entries large? |
||
|
||
auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data()); | ||
int values_decoded = 0; | ||
|
@@ -1983,7 +1990,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>, | |
int values_decoded = 0; | ||
|
||
ArrowBinaryHelper<ByteArrayType> helper(out, num_values); | ||
RETURN_NOT_OK(helper.Prepare(len_)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto |
||
|
||
auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data()); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This part doesn't call
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
, this keeps some logic same as before https://github.com/apache/arrow/pull/14341/files , it leavePrepare()
to allocate the data and don't push when chunk is switched. But I don't know if this is right