Skip to content

Commit

Permalink
Consolidate all sites that can hash a value inside a vector
Browse files Browse the repository at this point in the history
Summary:
This commit aims to guarantee consistent evaluation and hashing of NaN
(Not-a-Number) values for floating-point types across multiple sites,
including SimpleVector, VectorHasher, RowContainer, and ContainerRowSerde.
Unit tests will be added incrementally to make it easier to review and to
include additional changes that might be required for it.

The changes to these classes would affect the following:
- Multiple aggregates that have 'distinct' like semantics. Only for
   the codepaths that consume complex types as inputs. Aggregates
   include: multimap_agg, map_union_sum, map_union, count(distinct)
- In-predicate
- Map_subscript, only one optimization code path for
   constant/dictionary encoded map
- Hash Partitioning operator (HashPartition)
- Hash Join operator (HashProbe, HashTable)
- Local partitioning operator (LocalPartition)
- Hash Aggregation operator (GroupingSet)

Differential Revision: D57892204
  • Loading branch information
Bikramjeet Vig authored and facebook-github-bot committed May 29, 2024
1 parent ffe4cec commit 40ae068
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 10 deletions.
7 changes: 6 additions & 1 deletion velox/exec/ContainerRowSerde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "velox/exec/ContainerRowSerde.h"
#include "velox/type/FloatingPointUtil.h"
#include "velox/vector/ComplexVector.h"
#include "velox/vector/FlatVector.h"

Expand Down Expand Up @@ -728,7 +729,11 @@ uint64_t hashSwitch(ByteInputStream& stream, const Type* type);
template <TypeKind Kind>
uint64_t hashOne(ByteInputStream& stream, const Type* /*type*/) {
using T = typename TypeTraits<Kind>::NativeType;
return folly::hasher<T>()(stream.read<T>());
if constexpr (std::is_floating_point_v<T>) {
return util::floating_point::NaNAwareHash<T>()(stream.read<T>());
} else {
return folly::hasher<T>()(stream.read<T>());
}
}

template <>
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/RowContainer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "velox/exec/Aggregate.h"
#include "velox/exec/ContainerRowSerde.h"
#include "velox/exec/Operator.h"
#include "velox/type/FloatingPointUtil.h"

namespace facebook::velox::exec {
namespace {
Expand Down Expand Up @@ -866,6 +867,8 @@ void RowContainer::hashTyped(
Kind == TypeKind::MAP) {
auto in = prepareRead(row, offset);
hash = ContainerRowSerde::hash(in, type);
} else if constexpr (std::is_floating_point_v<T>) {
hash = util::floating_point::NaNAwareHash<T>()(valueAt<T>(row, offset));
} else {
hash = folly::hasher<T>()(valueAt<T>(row, offset));
}
Expand Down
7 changes: 6 additions & 1 deletion velox/exec/VectorHasher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "velox/common/base/Portability.h"
#include "velox/common/base/SimdUtil.h"
#include "velox/common/memory/HashStringAllocator.h"
#include "velox/type/FloatingPointUtil.h"

namespace facebook::velox::exec {

Expand Down Expand Up @@ -61,7 +62,11 @@ uint64_t hashOne(DecodedVector& decoded, vector_size_t index) {
}
// Inlined for scalars.
using T = typename KindToFlatVector<Kind>::HashRowType;
return folly::hasher<T>()(decoded.valueAt<T>(index));
if constexpr (std::is_floating_point_v<T>) {
return util::floating_point::NaNAwareHash<T>()(decoded.valueAt<T>(index));
} else {
return folly::hasher<T>()(decoded.valueAt<T>(index));
}
}
} // namespace

Expand Down
35 changes: 34 additions & 1 deletion velox/exec/tests/ContainerRowSerdeTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ class ContainerRowSerdeTest : public testing::Test,
}

void testCompare(const VectorPtr& vector) {
auto positions = serializeWithPositions(vector);
auto positions = serializeWithPositions(vector);

CompareFlags compareFlags =
CompareFlags::equality(CompareFlags::NullHandlingMode::kNullAsValue);
Expand Down Expand Up @@ -571,5 +571,38 @@ TEST_F(ContainerRowSerdeTest, fuzzCompare) {
}
}

TEST_F(ContainerRowSerdeTest, nans) {
// Verify that the NaNs with different representations are considered equal
// and have the same hash value.
auto vector = makeNullableFlatVector<double>(
{std::nan("1"),
std::nan("2"),
std::numeric_limits<double>::quiet_NaN(),
std::numeric_limits<double>::signaling_NaN()});

// Compare with the same NaN value
auto expected = makeConstant(std::nan("1"), 4, vector->type());

auto positions = serializeWithPositions(vector);

CompareFlags compareFlags =
CompareFlags::equality(CompareFlags::NullHandlingMode::kNullAsValue);

DecodedVector decodedVector(*expected);

for (auto i = 0; i < positions.size(); ++i) {
auto stream = HashStringAllocator::prepareRead(positions.at(i).header);
ASSERT_EQ(
0, ContainerRowSerde::compare(stream, decodedVector, i, compareFlags))
<< "at " << i << ": " << vector->toString(i);

stream = HashStringAllocator::prepareRead(positions.at(i).header);
ASSERT_EQ(
expected->hashValueAt(i),
ContainerRowSerde::hash(stream, vector->type().get()))
<< "at " << i << ": " << vector->toString(i);
}
}

} // namespace
} // namespace facebook::velox::exec
39 changes: 39 additions & 0 deletions velox/exec/tests/RowContainerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1690,6 +1690,45 @@ TEST_F(RowContainerTest, unknown) {
}));
}

TEST_F(RowContainerTest, nans) {
const static auto kNaN = std::numeric_limits<double>::quiet_NaN();
const static auto kSNaN = std::numeric_limits<double>::signaling_NaN();
static const auto kNaNHash = folly::hasher<double>{}(kNaN);
std::vector<TypePtr> types = {DOUBLE()};
auto rowContainer = std::make_unique<RowContainer>(types, pool_.get());

auto data = makeRowVector({
makeFlatVector<double>({kNaN, kSNaN}),
});

auto size = data->size();
DecodedVector decoded(*data->childAt(0));
auto rows = store(*rowContainer, decoded, size);

// Verify that the hashes are equal.
std::vector<uint64_t> hashes(size, 0);
rowContainer->hash(
0, folly::Range(rows.data(), rows.size()), false /*mix*/, hashes.data());
for (auto hash : hashes) {
ASSERT_EQ(kNaNHash, hash);
}

// Fill in hashes with sequential numbers: 0, 1, 2,..
std::iota(hashes.begin(), hashes.end(), 0);
rowContainer->hash(
0, folly::Range(rows.data(), rows.size()), true /*mix*/, hashes.data());
for (auto i = 0; i < size; ++i) {
ASSERT_EQ(bits::hashMix(i, kNaNHash), hashes[i]);
}

// Verify that they are considered equal.
for (size_t row = 0; row < size; ++row) {
ASSERT_TRUE(rowContainer->equals<false>(
rows[row], rowContainer->columnAt(0), decoded, row));
}
ASSERT_EQ(rowContainer->compare(rows[0], rows[1], 0, {}), 0);
}

TEST_F(RowContainerTest, toString) {
std::vector<TypePtr> keyTypes = {BIGINT(), VARCHAR()};
std::vector<TypePtr> dependentTypes = {TINYINT(), REAL(), ARRAY(BIGINT())};
Expand Down
32 changes: 32 additions & 0 deletions velox/exec/tests/VectorHasherTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,38 @@ TEST_F(VectorHasherTest, flat) {
}
}

TEST_F(VectorHasherTest, nans) {
// Sanity check to ensure the NaNs are correctly hashed, that is, all NaNs are
// considered equal and therefore should have the same hash.
static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();
folly::hasher<double> hasher;
static const auto kNaNHash = hasher(kNaN);
auto vectorHasher = exec::VectorHasher::create(DOUBLE(), 1);
ASSERT_EQ(vectorHasher->channel(), 1);
ASSERT_EQ(vectorHasher->typeKind(), TypeKind::DOUBLE);

// Using two different binary representations of NaN.
std::vector<double> inputValues = {1.0, -1.0, kNaN, kSNaN, 0.0};

auto vector = BaseVector::create(DOUBLE(), inputValues.size(), pool());
auto flatVector = vector->asFlatVector<double>();
for (int32_t i = 0; i < inputValues.size(); i++) {
flatVector->set(i, inputValues[i]);
}

raw_vector<uint64_t> hashes(inputValues.size());
std::fill(hashes.begin(), hashes.end(), 0);
SelectivityVector allRows = SelectivityVector(inputValues.size());
vectorHasher->decode(*vector, allRows);
vectorHasher->hash(allRows, false, hashes);
std::vector<uint64_t> expected = {
hasher(1.0), hasher(-1.0), kNaNHash, kNaNHash, hasher(0.0)};
for (int32_t i = 0; i < inputValues.size(); i++) {
EXPECT_EQ(hashes[i], expected[i]) << "at " << i;
}
}

TEST_F(VectorHasherTest, nonNullConstant) {
auto hasher = exec::VectorHasher::create(INTEGER(), 1);
auto vector = BaseVector::createConstant(INTEGER(), 123, 100, pool());
Expand Down
4 changes: 2 additions & 2 deletions velox/type/FloatingPointUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ template <
struct NaNAwareHash {
std::size_t operator()(const FLOAT& val) const noexcept {
static const std::size_t kNanHash =
std::hash<FLOAT>{}(std::numeric_limits<FLOAT>::quiet_NaN());
folly::hasher<FLOAT>{}(std::numeric_limits<FLOAT>::quiet_NaN());
if (std::isnan(val)) {
return kNanHash;
}
return std::hash<FLOAT>{}(val);
return folly::hasher<FLOAT>{}(val);
}
};

Expand Down
11 changes: 9 additions & 2 deletions velox/vector/SimpleVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "velox/functions/lib/string/StringCore.h"
#include "velox/type/DecimalUtil.h"
#include "velox/type/FloatingPointUtil.h"
#include "velox/type/Type.h"
#include "velox/vector/BaseVector.h"
#include "velox/vector/TypeAliases.h"
Expand Down Expand Up @@ -171,8 +172,14 @@ class SimpleVector : public BaseVector {
* @return the hash of the value at the given index in this vector
*/
uint64_t hashValueAt(vector_size_t index) const override {
return isNullAt(index) ? BaseVector::kNullHash
: folly::hasher<T>{}(valueAt(index));
if constexpr (std::is_floating_point_v<T>) {
return isNullAt(index)
? BaseVector::kNullHash
: util::floating_point::NaNAwareHash<T>{}(valueAt(index));
} else {
return isNullAt(index) ? BaseVector::kNullHash
: folly::hasher<T>{}(valueAt(index));
}
}

std::optional<bool> isSorted() const {
Expand Down
13 changes: 10 additions & 3 deletions velox/vector/tests/SimpleVectorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <algorithm>
#include <functional>

#include <fmt/core.h>
#include <glog/logging.h>
Expand All @@ -25,6 +26,7 @@
#include <folly/dynamic.h>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/type/FloatingPointUtil.h"
#include "velox/vector/BuilderTypeUtils.h"
#include "velox/vector/SimpleVector.h"
#include "velox/vector/tests/SimpleVectorTestHelper.h"
Expand Down Expand Up @@ -649,12 +651,17 @@ VELOX_TYPED_TEST_SUITE(SimpleVectorUnaryTypedTest, SimpleTypes);

TYPED_TEST(SimpleVectorUnaryTypedTest, hashAll) {
LOG(INFO) << "hashAll: " << type_name<TypeParam>();
folly::hasher<TypeParam> hasher;
std::function<size_t(TypeParam)> hasher;
if constexpr (std::is_floating_point_v<TypeParam>) {
hasher = util::floating_point::NaNAwareHash<TypeParam>{};
} else {
hasher = folly::hasher<TypeParam>{};
}
auto hashTest = [&hasher](SimpleVector<TypeParam>* vector) {
auto hashes = vector->hashAll();
for (size_t i = 0; i < vector->size(); ++i) {
auto expected = vector->isNullAt(i) ? BaseVector::kNullHash
: hasher(vector->valueAt(i));
uint64_t expected = vector->isNullAt(i) ? BaseVector::kNullHash
: hasher(vector->valueAt(i));
EXPECT_EQ(hashes->valueAt(i), expected);
}
};
Expand Down

0 comments on commit 40ae068

Please sign in to comment.