Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-38071: [C++][CI] Fix Overlap column chunk ranges for pre-buffer #38073

Merged
merged 5 commits into from
Oct 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/src/arrow/io/caching.cc
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,9 @@ struct ReadRangeCache::Impl {

// Add the given ranges to the cache, coalescing them where possible
virtual Status Cache(std::vector<ReadRange> ranges) {
ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
options.range_size_limit);
ARROW_ASSIGN_OR_RAISE(
ranges, internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
options.range_size_limit));
std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
// Add new entries, themselves ordered by offset
if (entries.size() > 0) {
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/arrow/io/interfaces.cc
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ ThreadPool* GetIOThreadPool() {
namespace {

struct ReadRangeCombiner {
std::vector<ReadRange> Coalesce(std::vector<ReadRange> ranges) {
Result<std::vector<ReadRange>> Coalesce(std::vector<ReadRange> ranges) {
if (ranges.empty()) {
return ranges;
}
Expand Down Expand Up @@ -454,7 +454,9 @@ struct ReadRangeCombiner {
const auto& left = ranges[i];
const auto& right = ranges[i + 1];
DCHECK_LE(left.offset, right.offset);
DCHECK_LE(left.offset + left.length, right.offset) << "Some read ranges overlap";
if (left.offset + left.length > right.offset) {
return Status::IOError("Some read ranges overlap");
}
}
#endif

Expand Down Expand Up @@ -509,9 +511,9 @@ struct ReadRangeCombiner {

}; // namespace

std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit) {
Result<std::vector<ReadRange>> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit) {
DCHECK_GT(range_size_limit, hole_size_limit);

ReadRangeCombiner combiner{hole_size_limit, range_size_limit};
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/io/util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ ARROW_EXPORT Status ValidateWriteRange(int64_t offset, int64_t size, int64_t fil
ARROW_EXPORT Status ValidateRange(int64_t offset, int64_t size);

ARROW_EXPORT
std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit);
Result<std::vector<ReadRange>> CoalesceReadRanges(std::vector<ReadRange> ranges,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't know if this matter, since it change the public api.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's in an internal header so no expectation that it remains stable

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that's good, thanks!

int64_t hole_size_limit,
int64_t range_size_limit);

ARROW_EXPORT
::arrow::internal::ThreadPool* GetIOThreadPool();
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met
::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
int64_t source_size, int row_group_index,
int column_index) {
auto row_group_metadata = file_metadata->RowGroup(row_group_index);
auto column_metadata = row_group_metadata->ColumnChunk(column_index);
std::unique_ptr<RowGroupMetaData> row_group_metadata =
file_metadata->RowGroup(row_group_index);
std::unique_ptr<ColumnChunkMetaData> column_metadata =
row_group_metadata->ColumnChunk(column_index);

int64_t col_start = column_metadata->data_page_offset();
if (column_metadata->has_dictionary_page() &&
Expand Down
Loading