From dd392eb19c6b00a6b2ee748ad84ba29aaaccd4d8 Mon Sep 17 00:00:00 2001
From: Masha Basmanova
Date: Thu, 7 Dec 2023 17:55:30 -0500
Subject: [PATCH] Fix HashTable::toString()

---
 velox/exec/HashTable.cpp           | 98 ++++++++++++++++++++++++------
 velox/exec/HashTable.h             |  5 ++
 velox/exec/tests/HashTableTest.cpp | 70 +++++++++++++++++++++
 3 files changed, 153 insertions(+), 20 deletions(-)

diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp
index 8ecbe18145ac3..b38a5c2a63771 100644
--- a/velox/exec/HashTable.cpp
+++ b/velox/exec/HashTable.cpp
@@ -1465,27 +1465,31 @@ void HashTable::decideHashMode(
 template
 std::string HashTable::toString() {
   std::stringstream out;
-  int64_t occupied = 0;
-
-  out << "[HashTable size: " << capacity_
+  out << "[HashTable keys: " << hashers_.size()
+      << " hash mode: " << modeString(hashMode_) << " capacity: " << capacity_
       << " distinct count: " << numDistinct_
-      << " tombstone count: " << numTombstones_ << "]";
+      << " tombstones count: " << numTombstones_ << "]";
   if (table_ == nullptr) {
-    out << "(no table) ";
+    out << " (no table)";
   }
+
   for (auto& hasher : hashers_) {
-    out << hasher->toString();
+    out << std::endl << hasher->toString();
   }
+  out << std::endl;
+
   if (kTrackLoads) {
-    out << std::endl;
     out << fmt::format(
-        "{} probes {} tag loads {} row loads {} hits",
-        numProbes_,
-        numTagLoads_,
-        numRowLoads_,
-        numHits_);
+               "{} probes {} tag loads {} row loads {} hits",
+               numProbes_,
+               numTagLoads_,
+               numRowLoads_,
+               numHits_)
+        << std::endl;
   }
+
   if (hashMode_ == HashMode::kArray) {
+    int64_t occupied = 0;
     if (table_ && tableAllocation_.data() && tableAllocation_.size()) {
       // 'size_' and 'table_' may not be set if initializing.
       uint64_t size = std::min(
@@ -1494,22 +1498,76 @@ std::string HashTable::toString() {
         occupied += table_[i] != nullptr;
       }
     }
+    out << "Total slots used: " << occupied << std::endl;
   } else {
-    // Count of groups indexed by number of non-empty slots.
-    int64_t numGroups[sizeof(TagVector) + 1] = {};
+    int64_t occupied = 0;
+
+    // Count of buckets indexed by the number of non-empty slots.
+    // Each bucket has 16 slots. Hence, the number of non-empty slots is between
+    // 0 and 16 (17 possible values).
+    int64_t numBuckets[sizeof(TagVector) + 1] = {};
     for (int64_t bucketOffset = 0; bucketOffset < sizeMask_;
          bucketOffset += kBucketSize) {
       auto tags = loadTags(bucketOffset);
       auto filled = simd::toBitMask(tags != TagVector::broadcast(0));
-      ++numGroups[__builtin_popcount(filled)];
-      occupied += filled;
+      auto numOccupied = __builtin_popcount(filled);
+
+      ++numBuckets[numOccupied];
+      occupied += numOccupied;
     }
-    out << " occupied=" << occupied;
-    out << std::endl;
-    for (auto i = 0; i < sizeof(numGroups) / sizeof(numGroups[0]); ++i) {
-      out << numGroups[i] << " groups with " << i << " entries" << std::endl;
+
+    out << "Total buckets: " << (sizeMask_ / kBucketSize + 1) << std::endl;
+    out << "Total slots used: " << occupied << std::endl;
+    for (auto i = 1; i < sizeof(TagVector) + 1; ++i) {
+      if (numBuckets[i] > 0) {
+        out << numBuckets[i] << " buckets with " << i << " slots used"
+            << std::endl;
+      }
+    }
+  }
+
+  return out.str();
+}
+
+template
+std::string HashTable::toString(
+    int64_t startBucket,
+    int64_t numBuckets) const {
+  if (table_ == nullptr) {
+    return "(no table)";
+  }
+
+  VELOX_CHECK_GE(startBucket, 0);
+  VELOX_CHECK_GT(numBuckets, 0);
+
+  const int64_t totalBuckets = sizeMask_ / kBucketSize + 1;
+  if (startBucket >= totalBuckets) {
+    return "";
+  }
+
+  const int64_t endBucket =
+      std::min(startBucket + numBuckets, totalBuckets);
+
+  std::ostringstream out;
+  for (int64_t i = startBucket; i < endBucket; ++i) {
+    // Pad bucket numbers to the digit count of the last one so rows align.
+    out << std::setw(std::to_string(endBucket - 1).size()) << i << ": ";
+    auto bucket = bucketAt(i * kBucketSize);
+    for (auto slot = 0; slot < sizeof(TagVector); ++slot) {
+      if (slot > 0) {
+        out << ", ";
+      }
+      if (bucket->tagAt(slot) == ProbeState::kTombstoneTag) {
+        out << std::setw(3) << "T";
+      } else if (bucket->tagAt(slot) == ProbeState::kEmptyTag) {
+        out << std::setw(3) << "E";
+      } else {
+        out << static_cast<int>(bucket->tagAt(slot));
+      }
     }
+    out << std::endl;
   }
+
   return out.str();
 }
 
diff --git a/velox/exec/HashTable.h b/velox/exec/HashTable.h
index 843f3641c69de..bf8c43deb5134 100644
--- a/velox/exec/HashTable.h
+++ b/velox/exec/HashTable.h
@@ -487,6 +487,11 @@ class HashTable : public BaseHashTable {
 
   std::string toString() override;
 
+  /// Returns the tags of up to 'numBuckets' buckets starting at zero-based
+  /// 'startBucket', one bucket per line ('T' = tombstone, 'E' = empty slot).
+  /// Returns "" if 'startBucket' is past the end, "(no table)" if no table.
+  std::string toString(int64_t startBucket, int64_t numBuckets = 1) const;
+
   /// Invoked to check the consistency of the internal state. The function scans
   /// all the table slots to check if the relevant slot counting are correct
   /// such as the number of used slots ('numDistinct_') and the number of
diff --git a/velox/exec/tests/HashTableTest.cpp b/velox/exec/tests/HashTableTest.cpp
index 7d0f871beb9b6..32e2d6fb67e6b 100644
--- a/velox/exec/tests/HashTableTest.cpp
+++ b/velox/exec/tests/HashTableTest.cpp
@@ -365,6 +365,22 @@ class HashTableTest : public testing::TestWithParam,
     }
   }
 
+  void store(RowContainer& rowContainer, const RowVectorPtr& data) {
+    std::vector decodedVectors;
+    for (auto& vector : data->children()) {
+      decodedVectors.emplace_back(*vector);
+    }
+
+    // Copy every row of 'data' into a newly allocated row of 'rowContainer'.
+    for (auto i = 0; i < data->size(); ++i) {
+      auto* row = rowContainer.newRow();
+
+      for (auto j = 0; j < decodedVectors.size(); ++j) {
+        rowContainer.store(decodedVectors[j], i, row, j);
+      }
+    }
+  }
+
   void testProbe() {
     auto lookup = std::make_unique(topTable_->hashers());
     auto batchSize = batches_[0]->size();
@@ -911,3 +927,57 @@ DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) {
   // Any outstanding async work should be finish cleanly despite the exception.
   executor_->join();
 }
+
+TEST_P(HashTableTest, toStringSingleKey) {
+  std::vector> hashers;
+  hashers.push_back(std::make_unique(BIGINT(), 0));
+
+  auto table = HashTable::createForJoin(
+      std::move(hashers),
+      {} /*dependentTypes*/,
+      true /*allowDuplicates*/,
+      false /*hasProbedFlag*/,
+      1 /*minTableSizeForParallelJoinBuild*/,
+      pool());
+
+  auto data = makeRowVector({
+      makeFlatVector(1'000, [](auto row) { return row / 2; }),
+  });
+
+  store(*table->rows(), data);
+
+  table->prepareJoinTable({});
+
+  ASSERT_NO_THROW(table->toString());
+  ASSERT_NO_THROW(table->toString(0));
+  ASSERT_NO_THROW(table->toString(10));
+  ASSERT_NO_THROW(table->toString(1000));
+  ASSERT_NO_THROW(table->toString(31, 5));
+}
+
+TEST_P(HashTableTest, toStringMultipleKeys) {
+  std::vector> hashers;
+  hashers.push_back(std::make_unique(BIGINT(), 0));
+  hashers.push_back(std::make_unique(VARCHAR(), 1));
+
+  auto table = HashTable::createForJoin(
+      std::move(hashers),
+      {} /*dependentTypes*/,
+      true /*allowDuplicates*/,
+      false /*hasProbedFlag*/,
+      1 /*minTableSizeForParallelJoinBuild*/,
+      pool());
+
+  vector_size_t size = 1'000;
+  auto data = makeRowVector({
+      makeFlatVector(size, [](auto row) { return row / 2; }),
+      makeFlatVector(
+          size, [](auto row) { return std::string(row, 'x'); }),
+  });
+
+  store(*table->rows(), data);
+
+  table->prepareJoinTable({});
+
+  ASSERT_NO_THROW(table->toString());
+}