Skip to content

Commit

Permalink
Add counters and stats for cache shrink for monitoring (#7645)
Browse files Browse the repository at this point in the history
Summary:
Add counters to track the cache shrink count and the execution time distribution for monitoring.
This helps us detect any memory usage pattern changes in prod once memory pushback support
is enabled, and check whether cache shrink is fast enough.

Pull Request resolved: #7645

Reviewed By: amitkdutta

Differential Revision: D51450820

Pulled By: xiaoxmeng

fbshipit-source-id: 20016931eb80f758f49c040932f898948c559d62
  • Loading branch information
xiaoxmeng authored and facebook-github-bot committed Nov 20, 2023
1 parent a86cd7b commit 43524cc
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 17 deletions.
8 changes: 8 additions & 0 deletions velox/common/base/Counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ void registerVeloxCounters() {
// P50, P90, P99, and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterHiveFileHandleGenerateLatencyMs, 10, 0, 100000, 50, 90, 99, 100);

REPORT_ADD_STAT_EXPORT_TYPE(
kCounterCacheShrinkCount, facebook::velox::StatType::COUNT);

// Track cache shrink latency in range of [0, 100s] and reports P50, P90, P99,
// and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterCacheShrinkTimeMs, 10, 0, 100'000, 50, 90, 99, 100);
}

} // namespace facebook::velox
5 changes: 5 additions & 0 deletions velox/common/base/Counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,9 @@ void registerVeloxCounters();

// Name of the histogram that tracks Hive file handle generation latency in
// milliseconds.
constexpr folly::StringPiece kCounterHiveFileHandleGenerateLatencyMs{
"velox.hive_file_handle_generate_latency_ms"};

// Name of the counter that is bumped once per AsyncDataCache::shrink() call.
constexpr folly::StringPiece kCounterCacheShrinkCount{
"velox.cache_shrink_count"};

// Name of the histogram that tracks the cache shrink execution time in
// milliseconds.
constexpr folly::StringPiece kCounterCacheShrinkTimeMs{"velox.cache_shrink_ms"};
} // namespace facebook::velox
49 changes: 32 additions & 17 deletions velox/common/caching/AsyncDataCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
#include "velox/common/caching/FileIds.h"
#include "velox/common/caching/SsdCache.h"

#include <folly/executors/QueuedImmediateExecutor.h>
#include "velox/common/base/Counters.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/base/SuccinctPrinter.h"
#include "velox/common/caching/FileIds.h"

Expand Down Expand Up @@ -694,26 +695,40 @@ bool AsyncDataCache::makeSpace(
// Evicts cached data to free up memory, aiming for 'targetBytes' which must
// be greater than zero. Makes at most 'shards_.size()' eviction calls, picking
// the shard from 'shardCounter_' each time, and stops early once at least
// 'targetBytes' have been evicted. Afterwards asks the allocator to unmap up
// to 'targetBytes' worth of pages. Reports each invocation to
// kCounterCacheShrinkCount and the elapsed time in milliseconds to
// kCounterCacheShrinkTimeMs. Returns the number of evicted bytes, which may be
// smaller or larger than 'targetBytes'.
uint64_t AsyncDataCache::shrink(uint64_t targetBytes) {
VELOX_CHECK_GT(targetBytes, 0);

REPORT_ADD_STAT_VALUE(kCounterCacheShrinkCount);
LOG(INFO) << "Try to shrink cache to free up "
<< velox::succinctBytes(targetBytes) << " memory";

// Lower bound (8MB) on the number of bytes requested from a shard per evict
// call, even if fewer bytes remain to reach 'targetBytes'.
const uint64_t minBytesToEvict = 8UL << 20;
uint64_t evictedBytes{0};
for (int shard = 0; shard < shards_.size(); ++shard) {
memory::Allocation unused;
evictedBytes += shards_[shardCounter_++ & (kShardMask)]->evict(
std::max<uint64_t>(minBytesToEvict, targetBytes - evictedBytes),
// Cache shrink is triggered when server is under low memory pressure
// so need to free up memory as soon as possible. So we always avoid
// triggering ssd save to accelerate the cache evictions.
true,
0,
unused);
VELOX_CHECK(unused.empty());
if (evictedBytes >= targetBytes) {
break;
uint64_t shrinkTimeUs{0};
{
// The timer is scoped so that 'shrinkTimeUs' is set before it is reported
// below. NOTE(review): presumably MicrosecondTimer stores the elapsed
// microseconds into 'shrinkTimeUs' on destruction - confirm.
MicrosecondTimer timer(&shrinkTimeUs);
for (int shard = 0; shard < shards_.size(); ++shard) {
memory::Allocation unused;
evictedBytes += shards_[shardCounter_++ & (kShardMask)]->evict(
std::max<uint64_t>(minBytesToEvict, targetBytes - evictedBytes),
// Cache shrink is triggered when server is under low memory pressure
// so need to free up memory as soon as possible. So we always avoid
// triggering ssd save to accelerate the cache evictions.
true,
0,
unused);
VELOX_CHECK(unused.empty());
if (evictedBytes >= targetBytes) {
break;
}
}
// Call unmap to free up to 'targetBytes' unused memory space back to
// operating system after shrink.
allocator_->unmap(memory::AllocationTraits::numPages(targetBytes));
}
// Call unmap to free up to 'targetBytes' unused memory space back to
// operating system after shrink.
allocator_->unmap(memory::AllocationTraits::numPages(targetBytes));

// The histogram is in milliseconds, so convert from microseconds.
REPORT_ADD_HISTOGRAM_VALUE(kCounterCacheShrinkTimeMs, shrinkTimeUs / 1'000);
LOG(INFO) << "Freed " << velox::succinctBytes(evictedBytes)
<< " cache memory, spent " << velox::succinctMicros(shrinkTimeUs)
<< "\n"
<< toString();
return evictedBytes;
}

Expand Down

0 comments on commit 43524cc

Please sign in to comment.