From 83b7711c4e965b3749073ba1b855b2a8980295b6 Mon Sep 17 00:00:00 2001 From: Zac Wen Date: Thu, 14 Nov 2024 14:47:14 -0800 Subject: [PATCH] Use Velox fs for ssd cache evictlog file When Copy-on-Write (COW) is disabled on Btrfs, automatic relocation creates snapshots of files, ignoring the noCOW setting. This results in increased disk usage and can lead to "no space left" errors in production. One possible enhancement we can make is to use fallocate to reserve space immediately after file creation. This helps ensure the allocated space is as continuous as possible. For environments where "no space left" errors are already happening, attempting to reserve space can result in failures. In that case, fallback to truncate and record it in runtime metrics. --- velox/common/base/Counters.cpp | 6 ++++++ velox/common/base/Counters.h | 3 +++ velox/common/caching/SsdFile.cpp | 21 ++++++++++++--------- velox/common/caching/SsdFile.h | 4 ++-- velox/common/caching/tests/SsdFileTest.cpp | 2 +- velox/common/file/File.cpp | 21 +++++++++++++++++++++ 6 files changed, 45 insertions(+), 12 deletions(-) diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index 1ff64da4232c..324e49ca3e4b 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -265,6 +265,12 @@ void registerVeloxMetrics() { DEFINE_METRIC( kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM); + // Total number of local file space allocation failures. + // NOTE: space allocation is attempted by fallocate wherever it is supported. + DEFINE_METRIC( + kMetricLocalFileSpaceAllocationFailuresCount, + facebook::velox::StatType::SUM); + /// ================== Memory Arbitration Counters ================= // The number of arbitration requests. diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 4ee6da39d9c1..b72060c6c99b 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -328,6 +328,9 @@ constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{ constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{ "velox.ssd_cache_recovered_entries"}; +constexpr folly::StringPiece kMetricLocalFileSpaceAllocationFailuresCount{ + "velox.local_file_space_allocation_failures_count"}; + constexpr folly::StringPiece kMetricExchangeDataTimeMs{ "velox.exchange_data_time_ms"}; diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index 68859306e338..c1297d485c66 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -169,11 +169,15 @@ SsdFile::SsdFile(const Config& config) writeFile_ = fs_->openFileForWrite(fileName_, fileOptions); readFile_ = fs_->openFileForRead(fileName_); - const uint64_t size = writeFile_->size(); - numRegions_ = std::min(size / kRegionSize, maxRegions_); - fileSize_ = numRegions_ * kRegionSize; - if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) { - writeFile_->truncate(fileSize_); + // NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_' + // accordingly. + numRegions_ = 0; + dataSize_ = 0; + + const auto maxFileSize = kRegionSize * maxRegions_; + if (writeFile_->size() != maxFileSize) { + // Initialize and pre-allocate (if possible) the data file with fixed space. + writeFile_->truncate(static_cast(maxFileSize)); } // The existing regions in the file are writable. writableRegions_.resize(numRegions_); @@ -334,10 +338,8 @@ std::optional> SsdFile::getSpace( bool SsdFile::growOrEvictLocked() { process::TraceContext trace("SsdFile::growOrEvictLocked"); if (numRegions_ < maxRegions_) { - const auto newSize = (numRegions_ + 1) * kRegionSize; try { - writeFile_->truncate(newSize); - fileSize_ = newSize; + dataSize_ = (numRegions_ + 1) * kRegionSize; writableRegions_.push_back(numRegions_); regionSizes_[numRegions_] = 0; erasedRegionSizes_[numRegions_] = 0; @@ -448,7 +450,7 @@ void SsdFile::write(std::vector& pins) { writeOffset += writeLength; writeLength = 0; } - VELOX_CHECK_GE(fileSize_, writeOffset); + VELOX_CHECK_GE(dataSize_, writeOffset); { std::lock_guard l(mutex_); @@ -1007,6 +1009,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) { maxRegions_, "Trying to start from checkpoint with a different capacity"); numRegions_ = readNumber(state); + dataSize_ = numRegions_ * kRegionSize; std::vector scores(maxRegions); state.read(asChar(scores.data()), maxRegions_ * sizeof(double)); std::unordered_map idMap; diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 31300a274a78..660be3f3d884 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -563,8 +563,8 @@ class SsdFile { // File system. std::shared_ptr fs_; - // Size of the backing file in bytes. Must be multiple of kRegionSize. - uint64_t fileSize_{0}; + // The size of actual cached data in bytes. Must be multiple of kRegionSize. + uint64_t dataSize_{0}; // ReadFile for cache data file. std::unique_ptr readFile_; diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 0b0bef1ca842..532f7855eb82 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -647,7 +647,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) { ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached); } else { ASSERT_EQ(statsAfterRecover.bytesCached, 0); - ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.regionsCached, 0); ASSERT_EQ(statsAfterRecover.entriesCached, 0); } diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index 6a30f0a26159..b187d004cb2e 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -15,7 +15,9 @@ */ #include "velox/common/file/File.h" +#include "velox/common/base/Counters.h" #include "velox/common/base/Fs.h" +#include "velox/common/base/StatsReporter.h" #include #include @@ -377,6 +379,25 @@ void LocalWriteFile::write( void LocalWriteFile::truncate(int64_t newSize) { checkNotClosed(closed_); VELOX_CHECK_GE(newSize, 0, "New size cannot be negative."); +#ifdef linux + if (newSize > size_) { + // Use fallocate to extend the file. + const auto ret = ::fallocate(fd_, 0, 0, newSize); + try { + VELOX_CHECK_EQ( + ret, + 0, + "fallocate failed in LocalWriteFile::truncate: {}.", + folly::errnoStr(errno)); + size_ = newSize; + return; + } catch (const std::exception& e) { + RECORD_METRIC_VALUE(kMetricLocalFileSpaceAllocationFailuresCount); + } + } +#endif // linux + + // Fallback to ftruncate. const auto ret = ::ftruncate(fd_, newSize); VELOX_CHECK_EQ( ret,