diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index 1ff64da4232c..324e49ca3e4b 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -265,6 +265,12 @@ void registerVeloxMetrics() { DEFINE_METRIC( kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM); + // Total number of local file space allocation failures. + // NOTE: space allocation is attempted by fallocate wherever it is supported. + DEFINE_METRIC( + kMetricLocalFileSpaceAllocationFailuresCount, + facebook::velox::StatType::SUM); + /// ================== Memory Arbitration Counters ================= // The number of arbitration requests. diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 4ee6da39d9c1..b72060c6c99b 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -328,6 +328,9 @@ constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{ constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{ "velox.ssd_cache_recovered_entries"}; +constexpr folly::StringPiece kMetricLocalFileSpaceAllocationFailuresCount{ + "velox.local_file_space_allocation_failures_count"}; + constexpr folly::StringPiece kMetricExchangeDataTimeMs{ "velox.exchange_data_time_ms"}; diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index 68859306e338..c1297d485c66 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -169,11 +169,15 @@ SsdFile::SsdFile(const Config& config) writeFile_ = fs_->openFileForWrite(fileName_, fileOptions); readFile_ = fs_->openFileForRead(fileName_); - const uint64_t size = writeFile_->size(); - numRegions_ = std::min(size / kRegionSize, maxRegions_); - fileSize_ = numRegions_ * kRegionSize; - if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) { - writeFile_->truncate(fileSize_); + // NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_' + // accordingly. + numRegions_ = 0; + dataSize_ = 0; + + const auto maxFileSize = kRegionSize * maxRegions_; + if (writeFile_->size() != maxFileSize) { + // Initialize and pre-allocate (if possible) the data file with fixed space. + writeFile_->truncate(static_cast(maxFileSize)); } // The existing regions in the file are writable. writableRegions_.resize(numRegions_); @@ -334,10 +338,8 @@ std::optional> SsdFile::getSpace( bool SsdFile::growOrEvictLocked() { process::TraceContext trace("SsdFile::growOrEvictLocked"); if (numRegions_ < maxRegions_) { - const auto newSize = (numRegions_ + 1) * kRegionSize; try { - writeFile_->truncate(newSize); - fileSize_ = newSize; + dataSize_ = (numRegions_ + 1) * kRegionSize; writableRegions_.push_back(numRegions_); regionSizes_[numRegions_] = 0; erasedRegionSizes_[numRegions_] = 0; @@ -448,7 +450,7 @@ void SsdFile::write(std::vector& pins) { writeOffset += writeLength; writeLength = 0; } - VELOX_CHECK_GE(fileSize_, writeOffset); + VELOX_CHECK_GE(dataSize_, writeOffset); { std::lock_guard l(mutex_); @@ -1007,6 +1009,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) { maxRegions_, "Trying to start from checkpoint with a different capacity"); numRegions_ = readNumber(state); + dataSize_ = numRegions_ * kRegionSize; std::vector scores(maxRegions); state.read(asChar(scores.data()), maxRegions_ * sizeof(double)); std::unordered_map idMap; diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 31300a274a78..660be3f3d884 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -563,8 +563,8 @@ class SsdFile { // File system. std::shared_ptr fs_; - // Size of the backing file in bytes. Must be multiple of kRegionSize. - uint64_t fileSize_{0}; + // The size of actual cached data in bytes. Must be multiple of kRegionSize. + uint64_t dataSize_{0}; // ReadFile for cache data file. std::unique_ptr readFile_; diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 0b0bef1ca842..532f7855eb82 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -647,7 +647,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) { ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached); } else { ASSERT_EQ(statsAfterRecover.bytesCached, 0); - ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.regionsCached, 0); ASSERT_EQ(statsAfterRecover.entriesCached, 0); } diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index 6a30f0a26159..b187d004cb2e 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -15,7 +15,9 @@ */ #include "velox/common/file/File.h" +#include "velox/common/base/Counters.h" #include "velox/common/base/Fs.h" +#include "velox/common/base/StatsReporter.h" #include #include @@ -377,6 +379,25 @@ void LocalWriteFile::write( void LocalWriteFile::truncate(int64_t newSize) { checkNotClosed(closed_); VELOX_CHECK_GE(newSize, 0, "New size cannot be negative."); +#ifdef linux + if (newSize > size_) { + // Use fallocate to extend the file. + const auto ret = ::fallocate(fd_, 0, 0, newSize); + try { + VELOX_CHECK_EQ( + ret, + 0, + "fallocate failed in LocalWriteFile::truncate: {}.", + folly::errnoStr(errno)); + size_ = newSize; + return; + } catch (const std::exception& e) { + RECORD_METRIC_VALUE(kMetricLocalFileSpaceAllocationFailuresCount); + } + } +#endif // linux + + // Fallback to ftruncate. const auto ret = ::ftruncate(fd_, newSize); VELOX_CHECK_EQ( ret,