Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix HashTable::toString() #7921

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 79 additions & 20 deletions velox/exec/HashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1465,27 +1465,31 @@ void HashTable<ignoreNullKeys>::decideHashMode(
template <bool ignoreNullKeys>
std::string HashTable<ignoreNullKeys>::toString() {
std::stringstream out;
int64_t occupied = 0;

out << "[HashTable size: " << capacity_
out << "[HashTable keys: " << hashers_.size()
<< " hash mode: " << modeString(hashMode_) << " capacity: " << capacity_
<< " distinct count: " << numDistinct_
<< " tombstone count: " << numTombstones_ << "]";
<< " tombstones count: " << numTombstones_ << "]";
if (table_ == nullptr) {
out << "(no table) ";
out << " (no table)";
}

for (auto& hasher : hashers_) {
out << hasher->toString();
out << std::endl << hasher->toString();
}
out << std::endl;

if (kTrackLoads) {
out << std::endl;
out << fmt::format(
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_);
"{} probes {} tag loads {} row loads {} hits",
numProbes_,
numTagLoads_,
numRowLoads_,
numHits_)
<< std::endl;
}

if (hashMode_ == HashMode::kArray) {
int64_t occupied = 0;
if (table_ && tableAllocation_.data() && tableAllocation_.size()) {
// 'size_' and 'table_' may not be set if initializing.
uint64_t size = std::min<uint64_t>(
Expand All @@ -1494,22 +1498,77 @@ std::string HashTable<ignoreNullKeys>::toString() {
occupied += table_[i] != nullptr;
}
}
out << "Total slots used: " << occupied << std::endl;
} else {
// Count of groups indexed by number of non-empty slots.
int64_t numGroups[sizeof(TagVector) + 1] = {};
int64_t occupied = 0;

// Count of buckets indexed by the number of non-empty slots.
// Each bucket has 16 slots. Hence, the number of non-empty slots is between
// 0 and 16 (17 possible values).
int64_t numBuckets[sizeof(TagVector) + 1] = {};
for (int64_t bucketOffset = 0; bucketOffset < sizeMask_;
bucketOffset += kBucketSize) {
auto tags = loadTags(bucketOffset);
auto filled = simd::toBitMask(tags != TagVector::broadcast(0));
++numGroups[__builtin_popcount(filled)];
occupied += filled;
auto numOccupied = __builtin_popcount(filled);

++numBuckets[numOccupied];
occupied += numOccupied;
}
out << " occupied=" << occupied;
out << std::endl;
for (auto i = 0; i < sizeof(numGroups) / sizeof(numGroups[0]); ++i) {
out << numGroups[i] << " groups with " << i << " entries" << std::endl;

out << "Total buckets: " << (sizeMask_ / kBucketSize + 1) << std::endl;
out << "Total slots used: " << occupied << std::endl;
for (auto i = 1; i < sizeof(TagVector) + 1; ++i) {
if (numBuckets[i] > 0) {
out << numBuckets[i] << " buckets with " << i << " slots used"
<< std::endl;
}
}
}

return out.str();
}

// Renders the tags of the buckets in [startBucket, startBucket + numBuckets),
// clamped to the end of the table. Produces one line per bucket: the bucket
// number, then the tag of every slot separated by ", ". Tombstone slots print
// as "T" and empty slots as "E"; any other tag prints as its integer value.
//
// Returns "(no table)" if 'table_' is null and an empty string if
// 'startBucket' is past the last bucket. Checks that 'startBucket' is
// non-negative and 'numBuckets' is positive.
//
// NOTE(review): buckets are read through bucketAt() regardless of
// 'hashMode_', whereas the no-argument toString() handles HashMode::kArray
// separately. Confirm callers only use this overload for bucketed tables.
template <bool ignoreNullKeys>
std::string HashTable<ignoreNullKeys>::toString(
    int64_t startBucket,
    int64_t numBuckets) const {
  if (table_ == nullptr) {
    return "(no table)";
  }

  VELOX_CHECK_GE(startBucket, 0);
  VELOX_CHECK_GT(numBuckets, 0);

  const int64_t totalBuckets = sizeMask_ / kBucketSize + 1;
  if (startBucket >= totalBuckets) {
    return "";
  }

  // Clamp the count before adding it to 'startBucket' so that a very large
  // 'numBuckets' cannot overflow the signed addition.
  const int64_t endBucket =
      startBucket + std::min<int64_t>(numBuckets, totalBuckets - startBucket);

  // Pad bucket numbers to the number of decimal digits of the largest bucket
  // number printed, so that the columns line up.
  int indexWidth = 1;
  for (int64_t rest = (endBucket - 1) / 10; rest > 0; rest /= 10) {
    ++indexWidth;
  }

  constexpr int kSlotsPerBucket = static_cast<int>(sizeof(TagVector));

  std::ostringstream out;
  for (int64_t i = startBucket; i < endBucket; ++i) {
    out << std::setw(indexWidth) << i << ": ";

    auto bucket = bucketAt(i * kBucketSize);
    for (int j = 0; j < kSlotsPerBucket; ++j) {
      if (j > 0) {
        out << ", ";
      }
      const auto tag = bucket->tagAt(j);
      // Every slot is printed with the same width to keep columns aligned.
      if (tag == ProbeState::kTombstoneTag) {
        out << std::setw(3) << "T";
      } else if (tag == ProbeState::kEmptyTag) {
        out << std::setw(3) << "E";
      } else {
        out << std::setw(3) << static_cast<int>(tag);
      }
    }
    out << '\n';
  }

  return out.str();
}

Expand Down
5 changes: 5 additions & 0 deletions velox/exec/HashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,11 @@ class HashTable : public BaseHashTable {

std::string toString() override;

/// Returns the details of the range of buckets. The range starts from
/// zero-based 'startBucket' and contains 'numBuckets' or however many there
/// are left till the end of the table.
///
/// The result has one line per bucket: the bucket number followed by the tag
/// of each slot in the bucket, separated by commas. Tombstone slots are shown
/// as "T" and empty slots as "E".
///
/// Returns "(no table)" if there is no table and an empty string if
/// 'startBucket' is past the last bucket. Otherwise requires 'startBucket' to
/// be non-negative and 'numBuckets' to be positive.
std::string toString(int64_t startBucket, int64_t numBuckets = 1) const;

/// Invoked to check the consistency of the internal state. The function scans
/// all the table slots to check if the relevant slot counting are correct
/// such as the number of used slots ('numDistinct_') and the number of
Expand Down
70 changes: 70 additions & 0 deletions velox/exec/tests/HashTableTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,22 @@ class HashTableTest : public testing::TestWithParam<bool>,
}
}

// Appends every row of 'data' to 'rowContainer'. A new container row is
// allocated for each input row and child j of 'data' is stored into column j
// of that row.
void store(RowContainer& rowContainer, const RowVectorPtr& data) {
  std::vector<DecodedVector> decodedVectors;
  for (auto& vector : data->children()) {
    decodedVectors.emplace_back(*vector);
  }

  // Signed column count avoids a signed/unsigned comparison in the loop below.
  const auto numColumns = static_cast<int>(decodedVectors.size());
  for (auto i = 0; i < data->size(); ++i) {
    auto* row = rowContainer.newRow();

    for (auto j = 0; j < numColumns; ++j) {
      rowContainer.store(decodedVectors[j], i, row, j);
    }
  }
}

void testProbe() {
auto lookup = std::make_unique<HashLookup>(topTable_->hashers());
auto batchSize = batches_[0]->size();
Expand Down Expand Up @@ -911,3 +927,57 @@ DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) {
// Any outstanding async work should finish cleanly despite the exception.
executor_->join();
}

// Checks that both toString() overloads run without throwing on a join table
// built over a single BIGINT key.
TEST_P(HashTableTest, toStringSingleKey) {
  // 1'000 rows where each key value appears twice: 0, 0, 1, 1, ...
  auto input = makeRowVector({
      makeFlatVector<int64_t>(1'000, [](auto row) { return row / 2; }),
  });

  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.push_back(std::make_unique<VectorHasher>(BIGINT(), 0));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  store(*joinTable->rows(), input);
  joinTable->prepareJoinTable({});

  // Summary of the whole table.
  ASSERT_NO_THROW(joinTable->toString());

  // A single bucket at various offsets, using the default 'numBuckets'.
  for (const int64_t startBucket : {0, 10, 1000}) {
    ASSERT_NO_THROW(joinTable->toString(startBucket));
  }

  // An explicit range of buckets.
  ASSERT_NO_THROW(joinTable->toString(31, 5));
}

// Checks that toString() runs without throwing on a join table built over two
// keys: a BIGINT and a VARCHAR.
TEST_P(HashTableTest, toStringMultipleKeys) {
  const vector_size_t numRows = 1'000;

  // The first key repeats every value twice; the second key is a string of
  // 'x' characters whose length equals the row number.
  auto input = makeRowVector({
      makeFlatVector<int64_t>(numRows, [](auto row) { return row / 2; }),
      makeFlatVector<std::string>(
          numRows, [](auto row) { return std::string(row, 'x'); }),
  });

  std::vector<std::unique_ptr<VectorHasher>> keyHashers;
  keyHashers.push_back(std::make_unique<VectorHasher>(BIGINT(), 0));
  keyHashers.push_back(std::make_unique<VectorHasher>(VARCHAR(), 1));

  auto joinTable = HashTable<false>::createForJoin(
      std::move(keyHashers),
      /*dependentTypes=*/{},
      /*allowDuplicates=*/true,
      /*hasProbedFlag=*/false,
      /*minTableSizeForParallelJoinBuild=*/1,
      pool());

  store(*joinTable->rows(), input);
  joinTable->prepareJoinTable({});

  ASSERT_NO_THROW(joinTable->toString());
}