Skip to content

Commit

Permalink
feat: Use fallocate for file size extension when supported (#11541)
Browse files Browse the repository at this point in the history
Summary:
When Copy-on-Write (COW) is disabled on Btrfs, automatic relocation creates snapshots of files, ignoring the noCOW setting. This results in increased disk usage and can lead to "no space left" errors in production.

One possible enhancement is to use fallocate to reserve space immediately after file creation. This helps ensure the allocated space is as contiguous as possible.

In environments where "no space left" errors are already occurring, the attempt to reserve space can itself fail. In that case, fall back to ftruncate and record the allocation failure in a runtime metric.


Reviewed By: xiaoxmeng

Differential Revision: D65977522

Pulled By: zacw7
  • Loading branch information
zacw7 authored and facebook-github-bot committed Nov 20, 2024
1 parent c13b8ed commit 5a37aa2
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 14 deletions.
7 changes: 7 additions & 0 deletions velox/common/base/Counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,13 @@ void registerVeloxMetrics() {
DEFINE_METRIC(
kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM);

// Total number of local file space allocation failures.
//
// NOTE: space allocation is attempted by fallocate wherever it is supported.
DEFINE_METRIC(
kMetricLocalFileSpaceAllocationFailuresCount,
facebook::velox::StatType::COUNT);

/// ================== Memory Arbitration Counters =================

// The number of arbitration requests.
Expand Down
3 changes: 3 additions & 0 deletions velox/common/base/Counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,9 @@ constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{
constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{
"velox.ssd_cache_recovered_entries"};

constexpr folly::StringPiece kMetricLocalFileSpaceAllocationFailuresCount{
"velox.local_file_space_allocation_failures_count"};

constexpr folly::StringPiece kMetricExchangeDataTimeMs{
"velox.exchange_data_time_ms"};

Expand Down
6 changes: 6 additions & 0 deletions velox/common/caching/SsdCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ namespace facebook::velox::cache {
#define VELOX_SSD_CACHE_LOG(severity) \
LOG(severity) << VELOX_SSD_CACHE_LOG_PREFIX

// Forward declaration of the test-only helper class that SsdCache declares as
// a friend.
namespace test {
class SsdCacheTestHelper;
} // namespace test

class SsdCache {
public:
struct Config {
Expand Down Expand Up @@ -191,6 +195,8 @@ class SsdCache {
// Count of shards with unfinished writes.
std::atomic_int32_t writesInProgress_{0};
bool shutdown_{false};

friend class test::SsdCacheTestHelper;
};

} // namespace facebook::velox::cache
21 changes: 12 additions & 9 deletions velox/common/caching/SsdFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,15 @@ SsdFile::SsdFile(const Config& config)
writeFile_ = fs_->openFileForWrite(fileName_, fileOptions);
readFile_ = fs_->openFileForRead(fileName_);

const uint64_t size = writeFile_->size();
numRegions_ = std::min<int32_t>(size / kRegionSize, maxRegions_);
fileSize_ = numRegions_ * kRegionSize;
if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) {
writeFile_->truncate(fileSize_);
// NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_'
// accordingly.
numRegions_ = 0;
dataSize_ = 0;

const auto maxFileSize = kRegionSize * maxRegions_;
if (writeFile_->size() != maxFileSize) {
// Initialize and pre-allocate (if possible) the data file with fixed space.
writeFile_->truncate(static_cast<int64_t>(maxFileSize));
}
// The existing regions in the file are writable.
writableRegions_.resize(numRegions_);
Expand Down Expand Up @@ -315,10 +319,8 @@ std::optional<std::pair<uint64_t, int32_t>> SsdFile::getSpace(
bool SsdFile::growOrEvictLocked() {
process::TraceContext trace("SsdFile::growOrEvictLocked");
if (numRegions_ < maxRegions_) {
const auto newSize = (numRegions_ + 1) * kRegionSize;
try {
writeFile_->truncate(newSize);
fileSize_ = newSize;
dataSize_ = (numRegions_ + 1) * kRegionSize;
writableRegions_.push_back(numRegions_);
regionSizes_[numRegions_] = 0;
erasedRegionSizes_[numRegions_] = 0;
Expand Down Expand Up @@ -429,7 +431,7 @@ void SsdFile::write(std::vector<CachePin>& pins) {
writeOffset += writeLength;
writeLength = 0;
}
VELOX_CHECK_GE(fileSize_, writeOffset);
VELOX_CHECK_GE(dataSize_, writeOffset);

{
std::lock_guard<std::shared_mutex> l(mutex_);
Expand Down Expand Up @@ -993,6 +995,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
maxRegions_,
"Trying to start from checkpoint with a different capacity");
numRegions_ = readNumber<int32_t>(state);
dataSize_ = numRegions_ * kRegionSize;
std::vector<double> scores(maxRegions);
state.read(asChar(scores.data()), maxRegions_ * sizeof(double));
std::unordered_map<uint64_t, StringIdLease> idMap;
Expand Down
12 changes: 10 additions & 2 deletions velox/common/caching/SsdFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ DECLARE_bool(ssd_verify_write);

namespace facebook::velox::cache {

// Forward declarations of the test-only helper classes that SsdFile declares
// as friends.
namespace test {
class SsdFileTestHelper;
class SsdCacheTestHelper;
} // namespace test

/// A 64 bit word describing a SSD cache entry in an SsdFile. The low 23 bits
/// are the size, for a maximum entry size of 8MB. The high bits are the offset.
class SsdRun {
Expand Down Expand Up @@ -570,8 +575,8 @@ class SsdFile {
// File system.
std::shared_ptr<filesystems::FileSystem> fs_;

// Size of the backing file in bytes. Must be multiple of kRegionSize.
uint64_t fileSize_{0};
// The size of actual cached data in bytes. Must be multiple of kRegionSize.
uint64_t dataSize_{0};

// ReadFile for cache data file.
std::unique_ptr<ReadFile> readFile_;
Expand All @@ -597,6 +602,9 @@ class SsdFile {

// True if there was an error with checkpoint and the checkpoint was deleted.
bool checkpointDeleted_{false};

friend class test::SsdFileTestHelper;
friend class test::SsdCacheTestHelper;
};

} // namespace facebook::velox::cache
13 changes: 13 additions & 0 deletions velox/common/caching/tests/AsyncDataCacheTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "velox/common/caching/CacheTTLController.h"
#include "velox/common/caching/FileIds.h"
#include "velox/common/caching/SsdCache.h"
#include "velox/common/caching/tests/CacheTestUtil.h"
#include "velox/common/file/FileSystems.h"
#include "velox/common/memory/Memory.h"
#include "velox/common/memory/MmapAllocator.h"
Expand Down Expand Up @@ -141,6 +142,18 @@ class AsyncDataCacheTest : public ::testing::TestWithParam<TestParam> {
GetParam().checksumEnabled,
GetParam().checksumVerificationEnabled);
ssdCache = std::make_unique<SsdCache>(config);
if (ssdCache != nullptr) {
test::SsdCacheTestHelper ssdCacheHelper(ssdCache.get());
ASSERT_EQ(ssdCacheHelper.numShards(), kNumSsdShards);
const auto sizeQuantum = kNumSsdShards * SsdFile::kRegionSize;
const auto maxNumRegions = static_cast<int32_t>(
bits::roundUp(config.maxBytes, sizeQuantum) / sizeQuantum);
for (int32_t i = 0; i < kNumSsdShards; ++i) {
ASSERT_EQ(
ssdCacheHelper.writeFileSize(i),
maxNumRegions * SsdFile::kRegionSize);
}
}
}

memory::MemoryManagerOptions options;
Expand Down
50 changes: 50 additions & 0 deletions velox/common/caching/tests/CacheTestUtil.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/common/caching/SsdCache.h"
#include "velox/common/caching/SsdFile.h"

namespace facebook::velox::cache::test {

/// Test-only accessor for SsdFile internals. SsdFile declares this class as a
/// friend, which is what allows it to read the private 'writeFile_' member.
///
/// The helper stores the raw pointer it is given and never deletes it; the
/// pointer is dereferenced without a null check, so callers must pass a valid
/// SsdFile.
class SsdFileTestHelper {
 public:
  /// @param ssdFile The SsdFile to inspect.
  explicit SsdFileTestHelper(SsdFile* ssdFile) : ssdFile_(ssdFile) {}

  /// Returns the size reported by the write handle of the wrapped SsdFile.
  /// Declared const because it only reads state.
  uint64_t writeFileSize() const {
    return ssdFile_->writeFile_->size();
  }

 private:
  // Not owned. The pointer itself is immutable, the pointee is not.
  SsdFile* const ssdFile_;
};

/// Test-only accessor for SsdCache internals. SsdCache and SsdFile both declare
/// this class as a friend, which is what allows it to read
/// SsdCache::numShards_ and SsdFile::writeFile_.
///
/// The helper stores the raw pointer it is given and never deletes it; the
/// pointer is dereferenced without a null check, so callers must pass a valid
/// SsdCache.
class SsdCacheTestHelper {
 public:
  /// @param ssdCache The SsdCache to inspect.
  explicit SsdCacheTestHelper(SsdCache* ssdCache) : ssdCache_(ssdCache) {}

  /// Returns the 'numShards_' value of the wrapped SsdCache.
  int32_t numShards() const {
    return ssdCache_->numShards_;
  }

  /// Returns the size reported by the write handle of the SsdFile that
  /// SsdCache::file() returns for 'fileId'.
  ///
  /// @param fileId Forwarded unchanged to SsdCache::file().
  /// NOTE(review): the visible caller passes a shard index in
  /// [0, numShards()); confirm how SsdCache::file() maps other values.
  uint64_t writeFileSize(uint64_t fileId) const {
    return ssdCache_->file(fileId).writeFile_->size();
  }

 private:
  // Not owned. The pointer itself is immutable, the pointee is not.
  SsdCache* const ssdCache_;
};
} // namespace facebook::velox::cache::test
13 changes: 10 additions & 3 deletions velox/common/caching/tests/SsdFileTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/common/caching/FileIds.h"
#include "velox/common/caching/SsdCache.h"
#include "velox/common/caching/tests/CacheTestUtil.h"
#include "velox/common/file/FileSystems.h"
#include "velox/common/file/tests/FaultyFileSystem.h"
#include "velox/common/memory/Memory.h"
Expand Down Expand Up @@ -91,15 +91,22 @@ class SsdFileTest : public testing::Test {
bool checksumEnabled = false,
bool checksumReadVerificationEnabled = false,
bool disableFileCow = false) {
const auto maxNumRegions = static_cast<int32_t>(
bits::roundUp(ssdBytes, SsdFile::kRegionSize) / SsdFile::kRegionSize);
SsdFile::Config config(
fmt::format("{}/ssdtest", tempDirectory_->getPath()),
0, // shardId
bits::roundUp(ssdBytes, SsdFile::kRegionSize) / SsdFile::kRegionSize,
maxNumRegions,
checkpointIntervalBytes,
disableFileCow,
checksumEnabled,
checksumReadVerificationEnabled);
ssdFile_ = std::make_unique<SsdFile>(config);
if (ssdFile_ != nullptr) {
test::SsdFileTestHelper ssdFileHelper(ssdFile_.get());
ASSERT_EQ(
ssdFileHelper.writeFileSize(), maxNumRegions * ssdFile_->kRegionSize);
}
}

// Corrupts the file by invalidate the last 1/10th of its content.
Expand Down Expand Up @@ -653,7 +660,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) {
ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached);
} else {
ASSERT_EQ(statsAfterRecover.bytesCached, 0);
ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached);
ASSERT_EQ(statsAfterRecover.regionsCached, 0);
ASSERT_EQ(statsAfterRecover.entriesCached, 0);
}

Expand Down
21 changes: 21 additions & 0 deletions velox/common/file/File.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
*/

#include "velox/common/file/File.h"
#include "velox/common/base/Counters.h"
#include "velox/common/base/Fs.h"
#include "velox/common/base/StatsReporter.h"

#include <fmt/format.h>
#include <glog/logging.h>
Expand Down Expand Up @@ -377,6 +379,25 @@ void LocalWriteFile::write(
void LocalWriteFile::truncate(int64_t newSize) {
checkNotClosed(closed_);
VELOX_CHECK_GE(newSize, 0, "New size cannot be negative.");
#ifdef linux
if (newSize > size_) {
// Use fallocate to extend the file.
const auto ret = ::fallocate(fd_, 0, 0, newSize);
try {
VELOX_CHECK_EQ(
ret,
0,
"fallocate failed in LocalWriteFile::truncate: {}.",
folly::errnoStr(errno));
size_ = newSize;
return;
} catch (const std::exception& e) {
RECORD_METRIC_VALUE(kMetricLocalFileSpaceAllocationFailuresCount);
}
}
#endif // linux

// Fallback to ftruncate.
const auto ret = ::ftruncate(fd_, newSize);
VELOX_CHECK_EQ(
ret,
Expand Down

0 comments on commit 5a37aa2

Please sign in to comment.