Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix HashTable::toString()
Browse files Browse the repository at this point in the history
mbasmanova committed Dec 7, 2023
1 parent 730dd53 commit dd392eb
Showing 3 changed files with 153 additions and 20 deletions.
98 changes: 78 additions & 20 deletions velox/exec/HashTable.cpp
Original file line number Diff line number Diff line change
@@ -1465,27 +1465,31 @@ void HashTable<ignoreNullKeys>::decideHashMode(
template <bool ignoreNullKeys>
std::string HashTable<ignoreNullKeys>::toString() {
std::stringstream out;
int64_t occupied = 0;

out << "[HashTable size: " << capacity_
out << "[HashTable keys: " << hashers_.size()
<< " hash mode: " << modeString(hashMode_) << " capacity: " << capacity_
<< " distinct count: " << numDistinct_
<< " tombstone count: " << numTombstones_ << "]";
<< " tombstones count: " << numTombstones_ << "]";
if (table_ == nullptr) {
out << "(no table) ";
out << " (no table)";
}

for (auto& hasher : hashers_) {
out << hasher->toString();
out << std::endl << hasher->toString();
}
out << std::endl;

if (kTrackLoads) {
out << std::endl;
out << fmt::format(
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_);
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_)
<< std::endl;
}

if (hashMode_ == HashMode::kArray) {
int64_t occupied = 0;
if (table_ && tableAllocation_.data() && tableAllocation_.size()) {
// 'size_' and 'table_' may not be set if initializing.
uint64_t size = std::min<uint64_t>(
@@ -1494,22 +1498,76 @@ std::string HashTable<ignoreNullKeys>::toString() {
occupied += table_[i] != nullptr;
}
}
out << "Total slots used: " << occupied << std::endl;
} else {
// Count of groups indexed by number of non-empty slots.
int64_t numGroups[sizeof(TagVector) + 1] = {};
int64_t occupied = 0;

// Count of buckets indexed by the number of non-empty slots.
// Each bucket has 16 slots. Hence, the number of non-empty slots is between
// 0 and 16 (17 possible values).
int64_t numBuckets[sizeof(TagVector) + 1] = {};
for (int64_t bucketOffset = 0; bucketOffset < sizeMask_;
bucketOffset += kBucketSize) {
auto tags = loadTags(bucketOffset);
auto filled = simd::toBitMask(tags != TagVector::broadcast(0));
++numGroups[__builtin_popcount(filled)];
occupied += filled;
auto numOccupied = __builtin_popcount(filled);

++numBuckets[numOccupied];
occupied += numOccupied;
}
out << " occupied=" << occupied;
out << std::endl;
for (auto i = 0; i < sizeof(numGroups) / sizeof(numGroups[0]); ++i) {
out << numGroups[i] << " groups with " << i << " entries" << std::endl;

out << "Total buckets: " << (sizeMask_ / kBucketSize + 1) << std::endl;
out << "Total slots used: " << occupied << std::endl;
for (auto i = 1; i < sizeof(TagVector) + 1; ++i) {
if (numBuckets[i] > 0) {
out << numBuckets[i] << " buckets with " << i << " slots used"
<< std::endl;
}
}
}

return out.str();
}

// Renders the tags of a contiguous range of buckets, one bucket per line, for
// debugging.
//
// Each line has the form "<bucket index>: <tag>, <tag>, ..." with
// sizeof(TagVector) entries. A slot whose tag equals
// ProbeState::kTombstoneTag is printed as "T", one whose tag equals
// ProbeState::kEmptyTag is printed as "E", and any other tag is printed as its
// integer value. Every entry is right-aligned in a 3-character column.
//
// 'startBucket' is the zero-based index of the first bucket and must be >= 0.
// 'numBuckets' is the number of buckets requested and must be > 0; the range
// is clipped at the last bucket of the table.
//
// Returns "(no table)" if 'table_' is null and an empty string if
// 'startBucket' is past the last bucket.
template <bool ignoreNullKeys>
std::string HashTable<ignoreNullKeys>::toString(
    int64_t startBucket,
    int64_t numBuckets) const {
  if (table_ == nullptr) {
    return "(no table)";
  }

  VELOX_CHECK_GE(startBucket, 0);
  VELOX_CHECK_GT(numBuckets, 0);

  const int64_t totalBuckets = sizeMask_ / kBucketSize + 1;
  if (startBucket >= totalBuckets) {
    return "";
  }

  // Clip the count before adding so that a very large 'numBuckets' cannot
  // overflow 'startBucket + numBuckets'. 'totalBuckets - startBucket' is
  // positive because of the check above.
  const int64_t endBucket =
      startBucket + std::min<int64_t>(numBuckets, totalBuckets - startBucket);

  // Number of decimal digits in the largest index printed (endBucket - 1), so
  // that all bucket indices are right-aligned in one column.
  int indexWidth = 1;
  for (int64_t rest = (endBucket - 1) / 10; rest > 0; rest /= 10) {
    ++indexWidth;
  }

  constexpr int32_t kNumSlots = static_cast<int32_t>(sizeof(TagVector));

  std::ostringstream out;
  for (int64_t bucketIndex = startBucket; bucketIndex < endBucket;
       ++bucketIndex) {
    out << std::setw(indexWidth) << bucketIndex << ": ";

    // bucketAt() is addressed by offset, hence the scaling by kBucketSize.
    auto bucket = bucketAt(bucketIndex * kBucketSize);
    for (int32_t slot = 0; slot < kNumSlots; ++slot) {
      if (slot > 0) {
        out << ", ";
      }
      const auto tag = bucket->tagAt(slot);
      if (tag == ProbeState::kTombstoneTag) {
        out << std::setw(3) << "T";
      } else if (tag == ProbeState::kEmptyTag) {
        out << std::setw(3) << "E";
      } else {
        // Cast so that the tag is printed as a number and not as a character.
        out << std::setw(3) << static_cast<int>(tag);
      }
    }
    out << '\n';
  }

  return out.str();
}

5 changes: 5 additions & 0 deletions velox/exec/HashTable.h
Original file line number Diff line number Diff line change
@@ -487,6 +487,11 @@ class HashTable : public BaseHashTable {

std::string toString() override;

/// Returns the details of the range of buckets. The range starts from
/// zero-based 'startBucket' and contains 'numBuckets' or however many there
/// are left till the end of the table. 'startBucket' must be non-negative and
/// 'numBuckets' must be positive (enforced with VELOX_CHECK_GE /
/// VELOX_CHECK_GT).
///
/// The result has one line per bucket: the bucket index followed by the tag
/// of each slot, where "T" stands for a tombstone tag, "E" for an empty tag
/// and any other tag is shown as its integer value. Returns "(no table)" if
/// the table has not been allocated and an empty string if 'startBucket' is
/// past the last bucket.
std::string toString(int64_t startBucket, int64_t numBuckets = 1) const;

/// Invoked to check the consistency of the internal state. The function scans
/// all the table slots to check if the relevant slot counts are correct
/// such as the number of used slots ('numDistinct_') and the number of
70 changes: 70 additions & 0 deletions velox/exec/tests/HashTableTest.cpp
Original file line number Diff line number Diff line change
@@ -365,6 +365,22 @@ class HashTableTest : public testing::TestWithParam<bool>,
}
}

// Copies all rows of 'data' into 'rowContainer'. Allocates one new row in the
// container per input row and stores the i-th child vector of 'data' into
// column i of that row.
void store(RowContainer& rowContainer, const RowVectorPtr& data) {
  std::vector<DecodedVector> decodedVectors;
  decodedVectors.reserve(data->children().size());
  for (const auto& vector : data->children()) {
    decodedVectors.emplace_back(*vector);
  }

  const auto numColumns = static_cast<int32_t>(decodedVectors.size());
  for (auto i = 0; i < data->size(); ++i) {
    auto* row = rowContainer.newRow();

    for (int32_t j = 0; j < numColumns; ++j) {
      rowContainer.store(decodedVectors[j], i, row, j);
    }
  }
}

void testProbe() {
auto lookup = std::make_unique<HashLookup>(topTable_->hashers());
auto batchSize = batches_[0]->size();
@@ -911,3 +927,57 @@ DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) {
// Any outstanding async work should finish cleanly despite the exception.
executor_->join();
}

// Smoke test for both toString() overloads on a join table with a single
// BIGINT key: the calls only need to complete without throwing.
TEST_P(HashTableTest, toStringSingleKey) {
  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.emplace_back(std::make_unique<VectorHasher>(BIGINT(), 0));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  // 1'000 rows where every key value appears twice.
  const auto input = makeRowVector({
      makeFlatVector<int64_t>(1'000, [](auto row) { return row / 2; }),
  });
  store(*joinTable->rows(), input);
  joinTable->prepareJoinTable({});

  // Summary of the whole table.
  ASSERT_NO_THROW(joinTable->toString());

  // A single bucket (default 'numBuckets') at several start positions.
  for (const int64_t startBucket : {0, 10, 1'000}) {
    ASSERT_NO_THROW(joinTable->toString(startBucket));
  }

  // An explicit multi-bucket range.
  ASSERT_NO_THROW(joinTable->toString(31, 5));
}

// Smoke test for toString() on a join table with two keys (BIGINT and
// VARCHAR): the call only needs to complete without throwing.
TEST_P(HashTableTest, toStringMultipleKeys) {
  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.emplace_back(std::make_unique<VectorHasher>(BIGINT(), 0));
  keyHashers.emplace_back(std::make_unique<VectorHasher>(VARCHAR(), 1));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  const vector_size_t numRows = 1'000;
  // First key: every value appears twice. Second key: a string of 'x'
  // characters whose length equals the row number.
  auto bigintKeys =
      makeFlatVector<int64_t>(numRows, [](auto row) { return row / 2; });
  auto varcharKeys = makeFlatVector<std::string>(
      numRows, [](auto row) { return std::string(row, 'x'); });
  const auto input = makeRowVector({bigintKeys, varcharKeys});

  store(*joinTable->rows(), input);
  joinTable->prepareJoinTable({});

  ASSERT_NO_THROW(joinTable->toString());
}

0 comments on commit dd392eb

Please sign in to comment.