Skip to content

Commit

Permalink
Use Velox fs for ssd cache evictlog file
Browse files Browse the repository at this point in the history
When Copy-on-Write (COW) is disabled on Btrfs, automatic relocation
creates snapshots of files, ignoring the noCOW setting. This results
in increased disk usage and can lead to "no space left" errors in
production.

One possible enhancement is to use fallocate to reserve space immediately
after file creation. This helps ensure the allocated space is as
contiguous as possible.

For environments where "no space left" errors are already happening,
attempting to reserve space can itself fail. In that case, fall back to
ftruncate and record the failure in runtime metrics.
  • Loading branch information
zacw7 committed Nov 14, 2024
1 parent c069192 commit 83b7711
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 12 deletions.
6 changes: 6 additions & 0 deletions velox/common/base/Counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,12 @@ void registerVeloxMetrics() {
DEFINE_METRIC(
kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM);

// Total number of local file space allocation failures.
// NOTE: space allocation is attempted by fallocate wherever it is supported.
DEFINE_METRIC(
kMetricLocalFileSpaceAllocationFailuresCount,
facebook::velox::StatType::SUM);

/// ================== Memory Arbitration Counters =================

// The number of arbitration requests.
Expand Down
3 changes: 3 additions & 0 deletions velox/common/base/Counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,9 @@ constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{
constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{
"velox.ssd_cache_recovered_entries"};

// Counts failures to pre-allocate space for a local file. On Linux builds it
// is recorded when the fallocate attempt in LocalWriteFile::truncate fails,
// before the code falls through to ftruncate.
constexpr folly::StringPiece kMetricLocalFileSpaceAllocationFailuresCount{
"velox.local_file_space_allocation_failures_count"};

constexpr folly::StringPiece kMetricExchangeDataTimeMs{
"velox.exchange_data_time_ms"};

Expand Down
21 changes: 12 additions & 9 deletions velox/common/caching/SsdFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,15 @@ SsdFile::SsdFile(const Config& config)
writeFile_ = fs_->openFileForWrite(fileName_, fileOptions);
readFile_ = fs_->openFileForRead(fileName_);

const uint64_t size = writeFile_->size();
numRegions_ = std::min<int32_t>(size / kRegionSize, maxRegions_);
fileSize_ = numRegions_ * kRegionSize;
if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) {
writeFile_->truncate(fileSize_);
// NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_'
// accordingly.
numRegions_ = 0;
dataSize_ = 0;

const auto maxFileSize = kRegionSize * maxRegions_;
if (writeFile_->size() != maxFileSize) {
// Initialize and pre-allocate (if possible) the data file with fixed space.
writeFile_->truncate(static_cast<int64_t>(maxFileSize));
}
// The existing regions in the file are writable.
writableRegions_.resize(numRegions_);
Expand Down Expand Up @@ -334,10 +338,8 @@ std::optional<std::pair<uint64_t, int32_t>> SsdFile::getSpace(
bool SsdFile::growOrEvictLocked() {
process::TraceContext trace("SsdFile::growOrEvictLocked");
if (numRegions_ < maxRegions_) {
const auto newSize = (numRegions_ + 1) * kRegionSize;
try {
writeFile_->truncate(newSize);
fileSize_ = newSize;
dataSize_ = (numRegions_ + 1) * kRegionSize;
writableRegions_.push_back(numRegions_);
regionSizes_[numRegions_] = 0;
erasedRegionSizes_[numRegions_] = 0;
Expand Down Expand Up @@ -448,7 +450,7 @@ void SsdFile::write(std::vector<CachePin>& pins) {
writeOffset += writeLength;
writeLength = 0;
}
VELOX_CHECK_GE(fileSize_, writeOffset);
VELOX_CHECK_GE(dataSize_, writeOffset);

{
std::lock_guard<std::shared_mutex> l(mutex_);
Expand Down Expand Up @@ -1007,6 +1009,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
maxRegions_,
"Trying to start from checkpoint with a different capacity");
numRegions_ = readNumber<int32_t>(state);
dataSize_ = numRegions_ * kRegionSize;
std::vector<double> scores(maxRegions);
state.read(asChar(scores.data()), maxRegions_ * sizeof(double));
std::unordered_map<uint64_t, StringIdLease> idMap;
Expand Down
4 changes: 2 additions & 2 deletions velox/common/caching/SsdFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -563,8 +563,8 @@ class SsdFile {
// File system.
std::shared_ptr<filesystems::FileSystem> fs_;

// Size of the backing file in bytes. Must be multiple of kRegionSize.
uint64_t fileSize_{0};
// The size of the actual cached data in bytes. Must be a multiple of kRegionSize.
uint64_t dataSize_{0};

// ReadFile for cache data file.
std::unique_ptr<ReadFile> readFile_;
Expand Down
2 changes: 1 addition & 1 deletion velox/common/caching/tests/SsdFileTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) {
ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached);
} else {
ASSERT_EQ(statsAfterRecover.bytesCached, 0);
ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached);
ASSERT_EQ(statsAfterRecover.regionsCached, 0);
ASSERT_EQ(statsAfterRecover.entriesCached, 0);
}

Expand Down
21 changes: 21 additions & 0 deletions velox/common/file/File.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
*/

#include "velox/common/file/File.h"
#include "velox/common/base/Counters.h"
#include "velox/common/base/Fs.h"
#include "velox/common/base/StatsReporter.h"

#include <fmt/format.h>
#include <glog/logging.h>
Expand Down Expand Up @@ -377,6 +379,25 @@ void LocalWriteFile::write(
void LocalWriteFile::truncate(int64_t newSize) {
checkNotClosed(closed_);
VELOX_CHECK_GE(newSize, 0, "New size cannot be negative.");
#ifdef linux
if (newSize > size_) {
// Use fallocate to extend the file.
const auto ret = ::fallocate(fd_, 0, 0, newSize);
try {
VELOX_CHECK_EQ(
ret,
0,
"fallocate failed in LocalWriteFile::truncate: {}.",
folly::errnoStr(errno));
size_ = newSize;
return;
} catch (const std::exception& e) {
RECORD_METRIC_VALUE(kMetricLocalFileSpaceAllocationFailuresCount);
}
}
#endif // linux

// Fallback to ftruncate.
const auto ret = ::ftruncate(fd_, newSize);
VELOX_CHECK_EQ(
ret,
Expand Down

0 comments on commit 83b7711

Please sign in to comment.