Skip to content

Commit

Permalink
Fix HashTable::toString()
Browse files Browse the repository at this point in the history
  • Loading branch information
mbasmanova committed Dec 7, 2023
1 parent 730dd53 commit 6729921
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 20 deletions.
55 changes: 35 additions & 20 deletions velox/exec/HashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1465,27 +1465,31 @@ void HashTable<ignoreNullKeys>::decideHashMode(
template <bool ignoreNullKeys>
std::string HashTable<ignoreNullKeys>::toString() {
std::stringstream out;
int64_t occupied = 0;

out << "[HashTable size: " << capacity_
out << "[HashTable keys: " << hashers_.size()
<< " hash mode: " << modeString(hashMode_) << " capacity: " << capacity_
<< " distinct count: " << numDistinct_
<< " tombstone count: " << numTombstones_ << "]";
<< " tombstones count: " << numTombstones_ << "]";
if (table_ == nullptr) {
out << "(no table) ";
out << " (no table)";
}

for (auto& hasher : hashers_) {
out << hasher->toString();
out << std::endl << hasher->toString();
}
out << std::endl;

if (kTrackLoads) {
out << std::endl;
out << fmt::format(
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_);
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_)
<< std::endl;
}

if (hashMode_ == HashMode::kArray) {
int64_t occupied = 0;
if (table_ && tableAllocation_.data() && tableAllocation_.size()) {
// 'size_' and 'table_' may not be set if initializing.
uint64_t size = std::min<uint64_t>(
Expand All @@ -1494,20 +1498,31 @@ std::string HashTable<ignoreNullKeys>::toString() {
occupied += table_[i] != nullptr;
}
}
out << "Total slots used: " << occupied << std::endl;
} else {
// Count of groups indexed by number of non-empty slots.
int64_t numGroups[sizeof(TagVector) + 1] = {};
int64_t occupied = 0;

// Count of buckets indexed by the number of non-empty slots.
// Each bucket has 16 slots. Hence, the number of non-empty slots is between
// 0 and 16 (17 possible values).
int64_t numBuckets[sizeof(TagVector) + 1] = {};
for (int64_t bucketOffset = 0; bucketOffset < sizeMask_;
bucketOffset += kBucketSize) {
auto tags = loadTags(bucketOffset);
auto filled = simd::toBitMask(tags != TagVector::broadcast(0));
++numGroups[__builtin_popcount(filled)];
occupied += filled;
auto numOccupied = __builtin_popcount(filled);

++numBuckets[numOccupied];
occupied += numOccupied;
}
out << " occupied=" << occupied;
out << std::endl;
for (auto i = 0; i < sizeof(numGroups) / sizeof(numGroups[0]); ++i) {
out << numGroups[i] << " groups with " << i << " entries" << std::endl;

out << "Total buckets: " << (sizeMask_ / kBucketSize) << std::endl;
out << "Total slots used: " << occupied << std::endl;
for (auto i = 1; i < sizeof(TagVector) + 1; ++i) {
if (numBuckets[i] > 0) {
out << numBuckets[i] << " buckets with " << i << " slots used"
<< std::endl;
}
}
}
return out.str();
Expand Down
66 changes: 66 additions & 0 deletions velox/exec/tests/HashTableTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,22 @@ class HashTableTest : public testing::TestWithParam<bool>,
}
}

// Appends every row of 'data' to 'rowContainer'.
//
// Each child vector of 'data' is decoded once up front. Then, for every row
// index in 'data', a new row is allocated via RowContainer::newRow() and the
// value of child 'j' at that index is stored into column 'j' of the new row.
//
// @param rowContainer Container that receives the rows.
// @param data Input rows; child 'j' is written to container column 'j'.
void store(RowContainer& rowContainer, const RowVectorPtr& data) {
  const auto& children = data->children();

  // Size the vector up front so decoded vectors are not relocated while the
  // vector grows.
  std::vector<DecodedVector> decodedVectors;
  decodedVectors.reserve(children.size());
  for (const auto& vector : children) {
    decodedVectors.emplace_back(*vector);
  }

  // Hoisted out of the loops; the column count is narrowed once so the loop
  // index keeps the signed type that was passed to RowContainer::store()
  // before, without a signed/unsigned comparison.
  const auto numRows = data->size();
  const auto numColumns = static_cast<int32_t>(decodedVectors.size());

  for (int32_t i = 0; i < numRows; ++i) {
    auto* row = rowContainer.newRow();

    for (int32_t j = 0; j < numColumns; ++j) {
      rowContainer.store(decodedVectors[j], i, row, j);
    }
  }
}

void testProbe() {
auto lookup = std::make_unique<HashLookup>(topTable_->hashers());
auto batchSize = batches_[0]->size();
Expand Down Expand Up @@ -911,3 +927,53 @@ DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) {
// Any outstanding async work should finish cleanly despite the exception.
executor_->join();
}

// Builds a join table keyed on a single BIGINT column, fills it with 1'000
// rows and checks that toString() completes without throwing.
TEST_P(HashTableTest, toStringSingleKey) {
  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.push_back(std::make_unique<VectorHasher>(BIGINT(), 0));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  // Key values are row / 2, so every key appears exactly twice.
  auto input = makeRowVector({
      makeFlatVector<int64_t>(1'000, [](auto row) { return row / 2; }),
  });
  store(*joinTable->rows(), input);

  joinTable->prepareJoinTable({});

  ASSERT_NO_THROW(joinTable->toString());
}

// Builds a join table keyed on a BIGINT column and a VARCHAR column, fills it
// with 1'000 rows and checks that toString() completes without throwing.
TEST_P(HashTableTest, toStringMultipleKeys) {
  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.push_back(std::make_unique<VectorHasher>(BIGINT(), 0));
  keyHashers.push_back(std::make_unique<VectorHasher>(VARCHAR(), 1));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  vector_size_t numRows = 1'000;

  // First key: row / 2. Second key: a string of 'x' repeated 'row' times.
  auto input = makeRowVector({
      makeFlatVector<int64_t>(numRows, [](auto row) { return row / 2; }),
      makeFlatVector<std::string>(
          numRows, [](auto row) { return std::string(row, 'x'); }),
  });
  store(*joinTable->rows(), input);

  joinTable->prepareJoinTable({});

  ASSERT_NO_THROW(joinTable->toString());
}

0 comments on commit 6729921

Please sign in to comment.