Skip to content

Commit

Permalink
apacheGH-43797: [C++] Attach arrow::ArrayStatistics to `arrow::Arra…
Browse files Browse the repository at this point in the history
…yData` (apache#43801)

### Rationale for this change

If we can attach associated statistics to an array via `ArrayData`, we can use it in later processes such as query planning.

If `ArrayData` not `Array` has statistics, we can use statistics in computing kernels.

There was a concern that associated `arrow::ArrayStatistics` may be outdated if `arrow::ArrayData` is mutated after attaching `arrow::ArrayStatistics`. But `arrow::ArrayData` isn't mutable after the first population. So `arrow::ArrayStatistics` will not be outdated. We can require mutators to take responsibility for statistics.

### What changes are included in this PR?

* Add `arrow::ArrayData::statistics`
* Add `arrow::Array::statistics()` to get statistics attached in `arrow::ArrayData`

This doesn't provide a new `arrow::ArrayData` constructor (`arrow::ArrayData::Make()`) that accepts `arrow::ArrayStatistics`. We can change `arrow::ArrayData::statistics` after we create `arrow::ArrayData`.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

`arrow::Array::statistics()` is a new public API.
* GitHub Issue: apache#43797

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
kou authored Sep 3, 2024
1 parent 589ab7a commit 4ed5a14
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 2 deletions.
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
///
/// \return const ArrayStatistics&
std::shared_ptr<ArrayStatistics> statistics() const { return data_->statistics; }

protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
Expand Down
126 changes: 126 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
}
}

class TestArrayDataStatistics : public ::testing::Test {
public:
void SetUp() {
valids_ = {1, 0, 1, 1};
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
null_buffer_ = *internal::BytesToBits(valids_);
values_ = {1, 0, 3, -4};
min_ = *std::min_element(values_.begin(), values_.end());
max_ = *std::max_element(values_.begin(), values_.end());
values_buffer_ = Buffer::FromVector(values_);
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
null_count_);
data_->statistics = std::make_shared<ArrayStatistics>();
data_->statistics->null_count = null_count_;
data_->statistics->min = min_;
data_->statistics->is_min_exact = true;
data_->statistics->max = max_;
data_->statistics->is_max_exact = true;
}

protected:
std::vector<uint8_t> valids_;
size_t null_count_;
std::shared_ptr<Buffer> null_buffer_;
std::vector<int32_t> values_;
int64_t min_;
int64_t max_;
std::shared_ptr<Buffer> values_buffer_;
std::shared_ptr<ArrayData> data_;
};

TEST_F(TestArrayDataStatistics, MoveConstructor) {
ArrayData copied_data(*data_);
ArrayData moved_data(std::move(copied_data));

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
ASSERT_TRUE(moved_data.statistics->is_min_exact);

ASSERT_TRUE(moved_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
ASSERT_TRUE(moved_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyConstructor) {
ArrayData copied_data(*data_);

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
ASSERT_TRUE(copied_data.statistics->is_min_exact);

ASSERT_TRUE(copied_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
ASSERT_TRUE(copied_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, MoveAssignment) {
ArrayData copied_data(*data_);
ArrayData moved_data;
moved_data = std::move(copied_data);

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
ASSERT_TRUE(moved_data.statistics->is_min_exact);

ASSERT_TRUE(moved_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
ASSERT_TRUE(moved_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyAssignment) {
ArrayData copied_data;
copied_data = *data_;

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
ASSERT_TRUE(copied_data.statistics->is_min_exact);

ASSERT_TRUE(copied_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
ASSERT_TRUE(copied_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyTo) {
ASSERT_OK_AND_ASSIGN(auto copied_data,
data_->CopyTo(arrow::default_cpu_memory_manager()));

ASSERT_TRUE(copied_data->statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());

ASSERT_TRUE(copied_data->statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
ASSERT_TRUE(copied_data->statistics->is_min_exact);

ASSERT_TRUE(copied_data->statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
ASSERT_TRUE(copied_data->statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, Slice) {
auto sliced_data = data_->Slice(0, 1);
ASSERT_FALSE(sliced_data->statistics);
}

template <typename PType>
class TestPrimitiveArray : public ::testing::Test {
public:
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const ArrayData& data,
ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn));
}

output->statistics = data.statistics;

return output;
}
} // namespace
Expand Down Expand Up @@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
} else {
copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
}
copy->statistics = nullptr;
return copy;
}

Expand Down
24 changes: 22 additions & 2 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <utility>
#include <vector>

#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)) {
dictionary(std::move(other.dictionary)),
statistics(std::move(other.statistics)) {
SetNullCount(other.null_count);
}

Expand All @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary) {
dictionary(other.dictionary),
statistics(other.statistics) {
SetNullCount(other.null_count);
}

Expand All @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
statistics = std::move(other.statistics);
return *this;
}

Expand All @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
statistics = other.statistics;
return *this;
}

Expand Down Expand Up @@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
}

/// \brief Construct a zero-copy slice of the data with the given offset and length
///
/// The associated `ArrayStatistics` is always discarded in a sliced
/// `ArrayData`. Because `ArrayStatistics` in the original
/// `ArrayData` may be invalid in a sliced `ArrayData`. If you want
/// to reuse statistics in the original `ArrayData`, you need to do
/// it by yourself.
///
/// If the specified slice range has the same range as the original
/// `ArrayData`, we can reuse statistics in the original
/// `ArrayData`. Because it has the same data as the original
/// `ArrayData`. But the associated `ArrayStatistics` is discarded
/// in this case too. Use `Copy()` instead for the case.
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

/// \brief Input-checking variant of Slice
Expand Down Expand Up @@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {

// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;

// The statistics for this Array.
std::shared_ptr<ArrayStatistics> statistics;
};

/// \brief A non-owning Buffer reference
Expand Down

0 comments on commit 4ed5a14

Please sign in to comment.