// Patch summary (text before the first 'diff --git' header is commentary).
//
// NOTE(review): this patch text has had its line breaks collapsed, and template
// argument lists appear to have been lost in extraction (e.g.
// 'std::vector> hashers', 'std::make_unique(BIGINT(), 0)', 'std::min('). It
// presumably needs to be regenerated from the commit before it can be applied
// -- TODO confirm.
//
// velox/exec/HashTable.cpp -- HashTable::toString():
//  * The header line now prints the number of keys (hashers_.size()) and the
//    hash mode (modeString(hashMode_)); the capacity_ value is labelled
//    "capacity:" instead of "size:", and "tombstone count" becomes
//    "tombstones count".
//  * " (no table)" now carries a leading instead of a trailing space.
//  * Each hasher's toString() is written after a line break, and one more line
//    break follows the hasher list.
//  * When kTrackLoads is set, the probe/load/hit counters line is now followed
//    by a line break instead of being preceded by one.
//  * 'occupied' moves from function scope into each branch.
//  * kArray mode: non-null table_ entries are counted and reported as
//    "Total slots used: N" (the removed " occupied=" output lived only in the
//    else branch of the visible hunks).
//  * Other modes: for every bucket the tags are loaded, compared against a
//    zero broadcast, converted to a bit mask and pop-counted. The pop count
//    indexes the numBuckets histogram and is also what is added to 'occupied';
//    the removed code added the raw bit mask 'filled' to 'occupied' instead.
//    The loop bound changes from 'bucketOffset < sizeMask_' to
//    'bucketOffset <= sizeMask_'.
//  * Output for other modes is "Total buckets: sizeMask_ / kBucketSize + 1",
//    "Total slots used: N", then one "<count> buckets with <i> slots used" line
//    for each i >= 1 whose count is non-zero (the removed code printed every
//    histogram entry, including index 0 and zero counts).
//
// velox/exec/tests/HashTableTest.cpp:
//  * New helper store(rowContainer, data): builds one decoded vector per child
//    of 'data', then for every row index allocates a row with
//    rowContainer.newRow() and stores each column into it via
//    rowContainer.store(decodedVectors[j], i, row, j).
//  * New tests toStringSingleKey (one BIGINT key) and toStringMultipleKeys
//    (BIGINT + VARCHAR keys): each creates a join table with createForJoin,
//    stores 1'000 rows whose first key is row / 2 (the second test adds a
//    std::string(row, 'x') column), calls prepareJoinTable({}) and asserts that
//    toString() does not throw.
//
// NOTE(review): 'rows' in store() is declared but never used in the added
//   lines -- looks removable; confirm.
// NOTE(review): 'for (auto i = 1; i < sizeof(TagVector) + 1; ++i)' compares an
//   int with a size_t expression; an unsigned index would avoid the
//   signed/unsigned comparison.
// NOTE(review): with bucketOffset starting at 0, the '<=' bound runs the loop
//   body at least once whenever sizeMask_ >= 0. toString() explicitly handles
//   table_ == nullptr earlier, and the kArray branch guards on table_, but this
//   branch does not -- confirm loadTags(0) is safe (and "Total buckets" is
//   meaningful) when no table has been allocated.
// NOTE(review): the added comment hard-codes "16 slots" while the histogram is
//   sized by sizeof(TagVector) + 1 -- presumably sizeof(TagVector) == 16;
//   confirm.
// NOTE(review): both tests only assert that toString() does not throw; the
//   produced text itself is not checked.
diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp index 8ecbe18145ac3..53c6b4fbf8271 100644 --- a/velox/exec/HashTable.cpp +++ b/velox/exec/HashTable.cpp @@ -1465,27 +1465,31 @@ void HashTable::decideHashMode( template std::string HashTable::toString() { std::stringstream out; - int64_t occupied = 0; - - out << "[HashTable size: " << capacity_ + out << "[HashTable keys: " << hashers_.size() + << " hash mode: " << modeString(hashMode_) << " capacity: " << capacity_ << " distinct count: " << numDistinct_ - << " tombstone count: " << numTombstones_ << "]"; + << " tombstones count: " << numTombstones_ << "]"; if (table_ == nullptr) { - out << "(no table) "; + out << " (no table)"; } + for (auto& hasher : hashers_) { - out << hasher->toString(); + out << std::endl << hasher->toString(); } + out << std::endl; + if (kTrackLoads) { - out << std::endl; out << fmt::format( - "{} probes {} tag loads {} row loads {} hits", - numProbes_, - numTagLoads_, - numRowLoads_, - numHits_); + "{} probes {} tag loads {} row loads {} hits", + numProbes_, + numTagLoads_, + numRowLoads_, + numHits_) + << std::endl; } + if (hashMode_ == HashMode::kArray) { + int64_t occupied = 0; if (table_ && tableAllocation_.data() && tableAllocation_.size()) { // 'size_' and 'table_' may not be set if initializing. uint64_t size = std::min( @@ -1494,22 +1498,34 @@ std::string HashTable::toString() { occupied += table_[i] != nullptr; } } + out << "Total slots used: " << occupied << std::endl; } else { - // Count of groups indexed by number of non-empty slots. - int64_t numGroups[sizeof(TagVector) + 1] = {}; - for (int64_t bucketOffset = 0; bucketOffset < sizeMask_; + int64_t occupied = 0; + + // Count of buckets indexed by the number of non-empty slots. + // Each bucket has 16 slots. Hence, the number of non-empty slots is between + // 0 and 16 (17 possible values). 
+ int64_t numBuckets[sizeof(TagVector) + 1] = {}; + for (int64_t bucketOffset = 0; bucketOffset <= sizeMask_; bucketOffset += kBucketSize) { auto tags = loadTags(bucketOffset); auto filled = simd::toBitMask(tags != TagVector::broadcast(0)); - ++numGroups[__builtin_popcount(filled)]; - occupied += filled; + auto numOccupied = __builtin_popcount(filled); + + ++numBuckets[numOccupied]; + occupied += numOccupied; } - out << " occupied=" << occupied; - out << std::endl; - for (auto i = 0; i < sizeof(numGroups) / sizeof(numGroups[0]); ++i) { - out << numGroups[i] << " groups with " << i << " entries" << std::endl; + + out << "Total buckets: " << (sizeMask_ / kBucketSize + 1) << std::endl; + out << "Total slots used: " << occupied << std::endl; + for (auto i = 1; i < sizeof(TagVector) + 1; ++i) { + if (numBuckets[i] > 0) { + out << numBuckets[i] << " buckets with " << i << " slots used" + << std::endl; + } } } + return out.str(); } diff --git a/velox/exec/tests/HashTableTest.cpp b/velox/exec/tests/HashTableTest.cpp index 7d0f871beb9b6..02155c5df755b 100644 --- a/velox/exec/tests/HashTableTest.cpp +++ b/velox/exec/tests/HashTableTest.cpp @@ -365,6 +365,22 @@ class HashTableTest : public testing::TestWithParam, } } + void store(RowContainer& rowContainer, const RowVectorPtr& data) { + std::vector decodedVectors; + for (auto& vector : data->children()) { + decodedVectors.emplace_back(*vector); + } + + std::vector rows; + for (auto i = 0; i < data->size(); ++i) { + auto* row = rowContainer.newRow(); + + for (auto j = 0; j < decodedVectors.size(); ++j) { + rowContainer.store(decodedVectors[j], i, row, j); + } + } + } + void testProbe() { auto lookup = std::make_unique(topTable_->hashers()); auto batchSize = batches_[0]->size(); @@ -911,3 +927,53 @@ DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) { // Any outstanding async work should be finish cleanly despite the exception. 
executor_->join(); } + +TEST_P(HashTableTest, toStringSingleKey) { + std::vector> hashers; + hashers.push_back(std::make_unique(BIGINT(), 0)); + + auto table = HashTable::createForJoin( + std::move(hashers), + {}, /*dependentTypes*/ + true /*allowDuplicates*/, + false /*hasProbedFlag*/, + 1 /*minTableSizeForParallelJoinBuild*/, + pool()); + + auto data = makeRowVector({ + makeFlatVector(1'000, [](auto row) { return row / 2; }), + }); + + store(*table->rows(), data); + + table->prepareJoinTable({}); + + ASSERT_NO_THROW(table->toString()); +} + +TEST_P(HashTableTest, toStringMultipleKeys) { + std::vector> hashers; + hashers.push_back(std::make_unique(BIGINT(), 0)); + hashers.push_back(std::make_unique(VARCHAR(), 1)); + + auto table = HashTable::createForJoin( + std::move(hashers), + {}, /*dependentTypes*/ + true /*allowDuplicates*/, + false /*hasProbedFlag*/, + 1 /*minTableSizeForParallelJoinBuild*/, + pool()); + + vector_size_t size = 1'000; + auto data = makeRowVector({ + makeFlatVector(size, [](auto row) { return row / 2; }), + makeFlatVector( + size, [](auto row) { return std::string(row, 'x'); }), + }); + + store(*table->rows(), data); + + table->prepareJoinTable({}); + + ASSERT_NO_THROW(table->toString()); +}