diff --git a/cpp/src/arrow/compute/kernels/scalar_hash_test.cc b/cpp/src/arrow/compute/kernels/scalar_hash_test.cc index c3c848c843770..55491403fe2e5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_hash_test.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include "arrow/chunked_array.h" #include "arrow/compute/api.h" @@ -36,10 +37,8 @@ namespace arrow { namespace compute { constexpr auto kSeed = 0x94378165; -// constexpr auto kArrayLengths = {0, 50, 100}; -// constexpr auto kNullProbabilities = {0.0, 0.5, 1.0}; -constexpr auto kArrayLengths = {5}; -constexpr auto kNullProbabilities = {0.0}; +constexpr auto kArrayLengths = {0, 50, 100}; +constexpr auto kNullProbabilities = {0.0, 0.5, 1.0}; class TestScalarHash : public ::testing::Test { public: @@ -136,6 +135,38 @@ class TestScalarHash : public ::testing::Test { } } + void CheckHashQuality(const std::string& func, const std::shared_ptr& arr, + float tolerance = 1.0) { + ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func, {arr})); + auto hashes = result.make_array(); + + auto expected = arr->length(); + if (arr->null_count()) { + expected -= (arr->null_count() - 1); + } + if (func == "hash64") { + auto hashes64 = dynamic_cast(hashes.get()); + std::unordered_set hash_set; + for (int64_t i = 0; i < hashes64->length(); ++i) { + hash_set.insert(hashes64->Value(i)); + } + ASSERT_LE(hash_set.size(), expected); + ASSERT_GE(hash_set.size(), expected * tolerance); + } else if (func == "hash32") { + auto hashes32 = dynamic_cast(hashes.get()); + std::unordered_set hash_set; + for (int64_t i = 0; i < hashes32->length(); ++i) { + if (hashes32->IsValid(i)) { + hash_set.insert(hashes32->Value(i)); + } + } + ASSERT_LE(hash_set.size(), expected); + ASSERT_GE(hash_set.size(), expected * tolerance); + } else { + FAIL() << "Unknown function: " << func; + } + } + void CheckPrimitive(const std::string& func, const std::shared_ptr& arr) { ASSERT_OK_AND_ASSIGN(Datum hash_result, CallFunction(func, {arr})); CheckDeterministic(func, arr); @@ -394,5 +425,56 @@ TEST_F(TestScalarHash, RandomMap) { } } +// copied from cpp/src/arrow/util/hashing_test.cc +template +static std::unordered_set MakeSequentialIntegers(int32_t n_values) { + std::unordered_set values; + values.reserve(n_values); + + for (int32_t i = 0; i < n_values; ++i) { + values.insert(static_cast(i)); + } + DCHECK_EQ(values.size(), static_cast(n_values)); + return values; +} + +// copied from cpp/src/arrow/util/hashing_test.cc +static std::unordered_set MakeDistinctStrings(int32_t n_values) { + std::unordered_set values; + values.reserve(n_values); + + // Generate strings between 0 and 24 bytes, with ASCII characters + std::default_random_engine gen(42); + std::uniform_int_distribution length_dist(0, 24); + std::uniform_int_distribution char_dist('0', 'z'); + + while (values.size() < static_cast(n_values)) { + auto length = length_dist(gen); + std::string s(length, 'X'); + for (int32_t i = 0; i < length; ++i) { + s[i] = static_cast(char_dist(gen)); + } + values.insert(std::move(s)); + } + return values; +} + +TEST_F(TestScalarHash, HashQuality) { + for (auto& func : {"hash32", "hash64"}) { + std::shared_ptr arr; + auto integer_values = MakeSequentialIntegers(100000); + auto integer_vector = + std::vector(integer_values.begin(), integer_values.end()); + arrow::ArrayFromVector(integer_vector, &arr); + CheckHashQuality(func, arr); + + auto string_values = MakeDistinctStrings(10000); + auto string_vector = + std::vector(string_values.begin(), string_values.end()); + arrow::ArrayFromVector(string_vector, &arr); + CheckHashQuality(func, arr); + } +} + } // namespace compute } // namespace arrow