Skip to content

Commit

Permalink
Optimize VectorHasher::makeValueIdsDecoded to not cache hash values w…
Browse files Browse the repository at this point in the history
…hen it is not beneficial

Summary:
Similar to facebookincubator#7150, when we only need to make IDs for a small number of rows fewer than dictionary values, using cache is slower and we should just compute the IDs directly.

Related issue: facebookincubator#10057

Differential Revision: D58215380
  • Loading branch information
Yuhta authored and facebook-github-bot committed Jun 6, 2024
1 parent 7637d56 commit 29a55b5
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 38 deletions.
86 changes: 48 additions & 38 deletions velox/exec/VectorHasher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,33 @@ bool VectorHasher::makeValueIdsFlatWithNulls<bool>(
return true;
}

template <typename T>
void VectorHasher::makeValueIdForOneRow(
bool isNull,
const T* values,
vector_size_t row,
uint64_t* result,
bool& success) {
if (isNull) {
if (multiplier_ == 1) {
*result = 0;
}
return;
}
T value = values[row];
if (!success) {
analyzeValue(value);
return;
}
auto id = valueId(value);
if (id == kUnmappable) {
success = false;
analyzeValue(value);
} else {
*result = multiplier_ == 1 ? id : *result + multiplier_ * id;
}
}

template <typename T>
bool VectorHasher::makeValueIdsFlatNoNulls(
const SelectivityVector& rows,
Expand All @@ -192,20 +219,8 @@ bool VectorHasher::makeValueIdsFlatNoNulls(

bool success = true;
rows.applyToSelected([&](vector_size_t row) INLINE_LAMBDA {
T value = values[row];
if (!success) {
// If all were not mappable and we do not remove unmappable,
// we just analyze the remaining so we can decide the hash mode.
analyzeValue(value);
return;
}
uint64_t id = valueId(value);
if (id == kUnmappable) {
success = false;
analyzeValue(value);
return;
}
result[row] = multiplier_ == 1 ? id : result[row] + multiplier_ * id;
auto* currentResult = result ? result + row : nullptr;
makeValueIdForOneRow<T>(false, values, row, currentResult, success);
});

return success;
Expand All @@ -220,26 +235,9 @@ bool VectorHasher::makeValueIdsFlatWithNulls(

bool success = true;
rows.applyToSelected([&](vector_size_t row) INLINE_LAMBDA {
if (bits::isBitNull(nulls, row)) {
if (multiplier_ == 1) {
result[row] = 0;
}
return;
}
T value = values[row];
if (!success) {
// If all were not mappable we just analyze the remaining so we can decide
// the hash mode.
analyzeValue(value);
return;
}
uint64_t id = valueId(value);
if (id == kUnmappable) {
success = false;
analyzeValue(value);
return;
}
result[row] = multiplier_ == 1 ? id : result[row] + multiplier_ * id;
auto* currentResult = result ? result + row : nullptr;
bool isNull = bits::isBitNull(nulls, row);
makeValueIdForOneRow<T>(isNull, values, row, currentResult, success);
});
return success;
}
Expand All @@ -248,13 +246,25 @@ template <typename T, bool mayHaveNulls>
bool VectorHasher::makeValueIdsDecoded(
const SelectivityVector& rows,
uint64_t* result) {
cachedHashes_.resize(decoded_.base()->size());
std::fill(cachedHashes_.begin(), cachedHashes_.end(), 0);

auto indices = decoded_.indices();
auto values = decoded_.data<T>();

bool success = true;

if (rows.countSelected() <= decoded_.base()->size()) {
// Cache is not beneficial in this case and we don't use them.
auto* nulls = decoded_.nulls(&rows);
rows.applyToSelected([&](vector_size_t row) INLINE_LAMBDA {
auto* currentResult = result ? result + row : nullptr;
bool isNull = mayHaveNulls ? bits::isBitNull(nulls, row) : false;
makeValueIdForOneRow<T>(
isNull, values, indices[row], currentResult, success);
});
return success;
}

cachedHashes_.resize(decoded_.base()->size());
std::fill(cachedHashes_.begin(), cachedHashes_.end(), 0);

int numCachedHashes = 0;
rows.testSelected([&](vector_size_t row) INLINE_LAMBDA {
if constexpr (mayHaveNulls) {
Expand Down
8 changes: 8 additions & 0 deletions velox/exec/VectorHasher.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,14 @@ class VectorHasher {
template <TypeKind Kind>
bool makeValueIds(const SelectivityVector& rows, uint64_t* result);

template <typename T>
void makeValueIdForOneRow(
bool isNull,
const T* values,
vector_size_t row,
uint64_t* result,
bool& success);

template <typename T>
bool makeValueIdsFlatNoNulls(const SelectivityVector& rows, uint64_t* result);

Expand Down
18 changes: 18 additions & 0 deletions velox/exec/tests/VectorHasherTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,24 @@ TEST_F(VectorHasherTest, computeValueIdsStrings) {
}

ASSERT_LE(uniqueValues.size(), multiplier);

{
result[42] = 0xAAAAAAAAAAAAAAAA;
VectorHasher hasher(dictionaryVectors[0]->type(), 0);
SelectivityVector oneRow(size, false);
oneRow.setValid(42, true);
oneRow.updateBounds();
hasher.decode(*dictionaryVectors[0], oneRow);
ASSERT_FALSE(hasher.computeValueIds(oneRow, result));
uint64_t asRange;
uint64_t asDistinct;
hasher.cardinality(0, asRange, asDistinct);
ASSERT_EQ(asDistinct, 2);
hasher.enableValueIds(1, 0);
hasher.decode(*dictionaryVectors[0], oneRow);
ASSERT_TRUE(hasher.computeValueIds(oneRow, result));
ASSERT_EQ(result[42], 1);
}
}

namespace {
Expand Down

0 comments on commit 29a55b5

Please sign in to comment.