From 5a37aa2c51c6e4b46432c9861abcfb6cb3ca091e Mon Sep 17 00:00:00 2001 From: Zac Wen Date: Wed, 20 Nov 2024 09:56:59 -0800 Subject: [PATCH] feat: Use fallocate for file size extension when supported (#11541) Summary: When Copy-on-Write (COW) is disabled on Btrfs, automatic relocation creates snapshots of files, ignoring the noCOW setting. This results in increased disk usage and can lead to "no space left" errors in production. One possible enhancement we can make is to use fallocate to reserve space immediately after file creation. This helps ensure the allocated space is as contiguous as possible. For environments where "no space left" errors are already happening, attempting to reserve space can result in failures. In that case, fall back to truncate and record the failure in runtime metrics. Reviewed By: xiaoxmeng Differential Revision: D65977522 Pulled By: zacw7 --- velox/common/base/Counters.cpp | 7 +++ velox/common/base/Counters.h | 3 ++ velox/common/caching/SsdCache.h | 6 +++ velox/common/caching/SsdFile.cpp | 21 ++++---- velox/common/caching/SsdFile.h | 12 ++++- .../caching/tests/AsyncDataCacheTest.cpp | 13 +++++ velox/common/caching/tests/CacheTestUtil.h | 50 +++++++++++++++++++ velox/common/caching/tests/SsdFileTest.cpp | 13 +++-- velox/common/file/File.cpp | 21 ++++++++ 9 files changed, 132 insertions(+), 14 deletions(-) create mode 100644 velox/common/caching/tests/CacheTestUtil.h diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index 1ff64da4232cb..5d14528f9c34e 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -265,6 +265,13 @@ void registerVeloxMetrics() { DEFINE_METRIC( kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM); + // Total number of local file space allocation failures. + // + // NOTE: space allocation is attempted by fallocate wherever it is supported. 
+ DEFINE_METRIC( + kMetricLocalFileSpaceAllocationFailuresCount, + facebook::velox::StatType::COUNT); + /// ================== Memory Arbitration Counters ================= // The number of arbitration requests. diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 4ee6da39d9c13..b72060c6c99b1 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -328,6 +328,9 @@ constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{ constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{ "velox.ssd_cache_recovered_entries"}; +constexpr folly::StringPiece kMetricLocalFileSpaceAllocationFailuresCount{ + "velox.local_file_space_allocation_failures_count"}; + constexpr folly::StringPiece kMetricExchangeDataTimeMs{ "velox.exchange_data_time_ms"}; diff --git a/velox/common/caching/SsdCache.h b/velox/common/caching/SsdCache.h index 386e79ed8eecb..73f405306deed 100644 --- a/velox/common/caching/SsdCache.h +++ b/velox/common/caching/SsdCache.h @@ -24,6 +24,10 @@ namespace facebook::velox::cache { #define VELOX_SSD_CACHE_LOG(severity) \ LOG(severity) << VELOX_SSD_CACHE_LOG_PREFIX +namespace test { +class SsdCacheTestHelper; +} + class SsdCache { public: struct Config { @@ -191,6 +195,8 @@ class SsdCache { // Count of shards with unfinished writes. 
std::atomic_int32_t writesInProgress_{0}; bool shutdown_{false}; + + friend class test::SsdCacheTestHelper; }; } // namespace facebook::velox::cache diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index bbf8e72c805fe..8460480774079 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -150,11 +150,15 @@ SsdFile::SsdFile(const Config& config) writeFile_ = fs_->openFileForWrite(fileName_, fileOptions); readFile_ = fs_->openFileForRead(fileName_); - const uint64_t size = writeFile_->size(); - numRegions_ = std::min(size / kRegionSize, maxRegions_); - fileSize_ = numRegions_ * kRegionSize; - if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) { - writeFile_->truncate(fileSize_); + // NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_' + // accordingly. + numRegions_ = 0; + dataSize_ = 0; + + const auto maxFileSize = kRegionSize * maxRegions_; + if (writeFile_->size() != maxFileSize) { + // Initialize and pre-allocate (if possible) the data file with fixed space. + writeFile_->truncate(static_cast(maxFileSize)); } // The existing regions in the file are writable. 
writableRegions_.resize(numRegions_); @@ -315,10 +319,8 @@ std::optional> SsdFile::getSpace( bool SsdFile::growOrEvictLocked() { process::TraceContext trace("SsdFile::growOrEvictLocked"); if (numRegions_ < maxRegions_) { - const auto newSize = (numRegions_ + 1) * kRegionSize; try { - writeFile_->truncate(newSize); - fileSize_ = newSize; + dataSize_ = (numRegions_ + 1) * kRegionSize; writableRegions_.push_back(numRegions_); regionSizes_[numRegions_] = 0; erasedRegionSizes_[numRegions_] = 0; @@ -429,7 +431,7 @@ void SsdFile::write(std::vector& pins) { writeOffset += writeLength; writeLength = 0; } - VELOX_CHECK_GE(fileSize_, writeOffset); + VELOX_CHECK_GE(dataSize_, writeOffset); { std::lock_guard l(mutex_); @@ -993,6 +995,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) { maxRegions_, "Trying to start from checkpoint with a different capacity"); numRegions_ = readNumber(state); + dataSize_ = numRegions_ * kRegionSize; std::vector scores(maxRegions); state.read(asChar(scores.data()), maxRegions_ * sizeof(double)); std::unordered_map idMap; diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 907ddaf9d15bf..708191368d498 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -29,6 +29,11 @@ DECLARE_bool(ssd_verify_write); namespace facebook::velox::cache { +namespace test { +class SsdFileTestHelper; +class SsdCacheTestHelper; +} // namespace test + /// A 64 bit word describing a SSD cache entry in an SsdFile. The low 23 bits /// are the size, for a maximum entry size of 8MB. The high bits are the offset. class SsdRun { @@ -570,8 +575,8 @@ class SsdFile { // File system. std::shared_ptr fs_; - // Size of the backing file in bytes. Must be multiple of kRegionSize. - uint64_t fileSize_{0}; + // The size of actual cached data in bytes. Must be multiple of kRegionSize. + uint64_t dataSize_{0}; // ReadFile for cache data file. 
std::unique_ptr readFile_; @@ -597,6 +602,9 @@ class SsdFile { // True if there was an error with checkpoint and the checkpoint was deleted. bool checkpointDeleted_{false}; + + friend class test::SsdFileTestHelper; + friend class test::SsdCacheTestHelper; }; } // namespace facebook::velox::cache diff --git a/velox/common/caching/tests/AsyncDataCacheTest.cpp b/velox/common/caching/tests/AsyncDataCacheTest.cpp index d0c163d64a853..8de80f64fe334 100644 --- a/velox/common/caching/tests/AsyncDataCacheTest.cpp +++ b/velox/common/caching/tests/AsyncDataCacheTest.cpp @@ -20,6 +20,7 @@ #include "velox/common/caching/CacheTTLController.h" #include "velox/common/caching/FileIds.h" #include "velox/common/caching/SsdCache.h" +#include "velox/common/caching/tests/CacheTestUtil.h" #include "velox/common/file/FileSystems.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MmapAllocator.h" @@ -141,6 +142,18 @@ class AsyncDataCacheTest : public ::testing::TestWithParam { GetParam().checksumEnabled, GetParam().checksumVerificationEnabled); ssdCache = std::make_unique(config); + if (ssdCache != nullptr) { + test::SsdCacheTestHelper ssdCacheHelper(ssdCache.get()); + ASSERT_EQ(ssdCacheHelper.numShards(), kNumSsdShards); + const auto sizeQuantum = kNumSsdShards * SsdFile::kRegionSize; + const auto maxNumRegions = static_cast( + bits::roundUp(config.maxBytes, sizeQuantum) / sizeQuantum); + for (int32_t i = 0; i < kNumSsdShards; ++i) { + ASSERT_EQ( + ssdCacheHelper.writeFileSize(i), + maxNumRegions * SsdFile::kRegionSize); + } + } } memory::MemoryManagerOptions options; diff --git a/velox/common/caching/tests/CacheTestUtil.h b/velox/common/caching/tests/CacheTestUtil.h new file mode 100644 index 0000000000000..38928b90051ac --- /dev/null +++ b/velox/common/caching/tests/CacheTestUtil.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/common/caching/SsdCache.h" +#include "velox/common/caching/SsdFile.h" + +namespace facebook::velox::cache::test { + +class SsdFileTestHelper { + public: + explicit SsdFileTestHelper(SsdFile* ssdFile) : ssdFile_(ssdFile) {} + + uint64_t writeFileSize() { + return ssdFile_->writeFile_->size(); + } + + private: + SsdFile* const ssdFile_; +}; + +class SsdCacheTestHelper { + public: + explicit SsdCacheTestHelper(SsdCache* ssdCache) : ssdCache_(ssdCache){}; + + int32_t numShards() { + return ssdCache_->numShards_; + } + + uint64_t writeFileSize(uint64_t fileId) { + return ssdCache_->file(fileId).writeFile_->size(); + } + + private: + SsdCache* const ssdCache_; +}; +} // namespace facebook::velox::cache::test diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 771420243f6c9..531f2b37919ab 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -16,7 +16,7 @@ #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/caching/FileIds.h" -#include "velox/common/caching/SsdCache.h" +#include "velox/common/caching/tests/CacheTestUtil.h" #include "velox/common/file/FileSystems.h" #include "velox/common/file/tests/FaultyFileSystem.h" #include "velox/common/memory/Memory.h" @@ -91,15 +91,22 @@ class SsdFileTest : public testing::Test { bool checksumEnabled = false, bool 
checksumReadVerificationEnabled = false, bool disableFileCow = false) { + const auto maxNumRegions = static_cast( + bits::roundUp(ssdBytes, SsdFile::kRegionSize) / SsdFile::kRegionSize); SsdFile::Config config( fmt::format("{}/ssdtest", tempDirectory_->getPath()), 0, // shardId - bits::roundUp(ssdBytes, SsdFile::kRegionSize) / SsdFile::kRegionSize, + maxNumRegions, checkpointIntervalBytes, disableFileCow, checksumEnabled, checksumReadVerificationEnabled); ssdFile_ = std::make_unique(config); + if (ssdFile_ != nullptr) { + test::SsdFileTestHelper ssdFileHelper(ssdFile_.get()); + ASSERT_EQ( + ssdFileHelper.writeFileSize(), maxNumRegions * ssdFile_->kRegionSize); + } } // Corrupts the file by invalidate the last 1/10th of its content. @@ -653,7 +660,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) { ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached); } else { ASSERT_EQ(statsAfterRecover.bytesCached, 0); - ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.regionsCached, 0); ASSERT_EQ(statsAfterRecover.entriesCached, 0); } diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index 6a30f0a26159e..b187d004cb2ef 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -15,7 +15,9 @@ */ #include "velox/common/file/File.h" +#include "velox/common/base/Counters.h" #include "velox/common/base/Fs.h" +#include "velox/common/base/StatsReporter.h" #include #include @@ -377,6 +379,25 @@ void LocalWriteFile::write( void LocalWriteFile::truncate(int64_t newSize) { checkNotClosed(closed_); VELOX_CHECK_GE(newSize, 0, "New size cannot be negative."); +#ifdef linux + if (newSize > size_) { + // Use fallocate to extend the file. 
+ const auto ret = ::fallocate(fd_, 0, 0, newSize); + try { + VELOX_CHECK_EQ( + ret, + 0, + "fallocate failed in LocalWriteFile::truncate: {}.", + folly::errnoStr(errno)); + size_ = newSize; + return; + } catch (const std::exception& e) { + RECORD_METRIC_VALUE(kMetricLocalFileSpaceAllocationFailuresCount); + } + } +#endif // linux + + // Fallback to ftruncate. const auto ret = ::ftruncate(fd_, newSize); VELOX_CHECK_EQ( ret,