Skip to content

Commit

Permalink
safekeeper: use set_len() to zero out segments (#9665)
Browse files Browse the repository at this point in the history
## Problem

When we create a new segment, we zero it out in order to avoid changing
the length and fsyncing metadata on every write. However, we zeroed it
out by writing 8 KB zero-pages, and Tokio file writes have non-trivial
overhead.

## Summary of changes

Zero out the segment using
[`File::set_len()`](https://docs.rs/tokio/latest/i686-unknown-linux-gnu/tokio/fs/struct.File.html#method.set_len)
instead. On most filesystems this just creates a sparse file, omitting the
16 MB of zero data entirely. This improves WAL
append throughput for large messages by over 400% with fsync disabled,
and 100% with fsync enabled.
  • Loading branch information
erikgrinaker authored Nov 7, 2024
1 parent 01265b7 commit f18aa04
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 27 deletions.
45 changes: 19 additions & 26 deletions safekeeper/src/wal_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ use crate::state::TimelinePersistentState;
use crate::wal_backup::{read_object, remote_timeline_path};
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
use postgres_ffi::XLOG_BLCKSZ;
use pq_proto::SystemId;
use utils::{id::TenantTimelineId, lsn::Lsn};

Expand Down Expand Up @@ -223,6 +222,15 @@ impl PhysicalStorage {
)
}

/// Calls `fsync` (sync_all) on `file`, recording the flush duration in the
/// metrics — unless syncing is disabled via the `no_sync` config flag.
async fn fsync_file(&mut self, file: &File) -> Result<()> {
    if self.no_sync {
        return Ok(());
    }
    let elapsed = time_io_closure(file.sync_all()).await?;
    self.metrics.observe_flush_seconds(elapsed);
    Ok(())
}

/// Call fdatasync if config requires so.
async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
if !self.no_sync {
Expand Down Expand Up @@ -256,11 +264,15 @@ impl PhysicalStorage {
// half initialized segment, first bake it under tmp filename and
// then rename.
let tmp_path = self.timeline_dir.join("waltmp");
let mut file = File::create(&tmp_path)
let file = File::create(&tmp_path)
.await
.with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;

write_zeroes(&mut file, self.wal_seg_size).await?;
fail::fail_point!("sk-zero-segment", |_| {
info!("sk-zero-segment failpoint hit");
Err(anyhow::anyhow!("failpoint: sk-zero-segment"))
});
file.set_len(self.wal_seg_size as u64).await?;

// Note: this doesn't get into observe_flush_seconds metric. But
// segment init should be separate metric, if any.
Expand Down Expand Up @@ -486,12 +498,12 @@ impl Storage for PhysicalStorage {
// Remove all segments after the given LSN.
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;

let (mut file, is_partial) = self.open_or_create(segno).await?;
let (file, is_partial) = self.open_or_create(segno).await?;

// Fill end with zeroes
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
self.fdatasync_file(&file).await?;
file.set_len(xlogoff as u64).await?;
file.set_len(self.wal_seg_size as u64).await?;
self.fsync_file(&file).await?;

if !is_partial {
// Make segment partial once again
Expand Down Expand Up @@ -751,25 +763,6 @@ impl WalReader {
}
}

/// Zero block for filling created WAL segments.
/// Sized to one WAL page so writes are page-aligned.
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];

/// Helper for filling file with zeroes.
///
/// Writes `count` zero bytes to `file` starting at its current position:
/// full XLOG_BLCKSZ-sized blocks first, then one final partial block, then
/// flushes the write buffer. Note: flush() does not fsync; the caller is
/// responsible for any durability guarantee.
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
    // Test failpoint: simulate an I/O error while zeroing a new segment.
    fail::fail_point!("sk-write-zeroes", |_| {
        info!("write_zeroes hit failpoint");
        Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
    });

    // Write whole zero pages while at least one full page remains.
    while count >= XLOG_BLCKSZ {
        file.write_all(ZERO_BLOCK).await?;
        count -= XLOG_BLCKSZ;
    }
    // Write the remaining tail (possibly zero bytes).
    file.write_all(&ZERO_BLOCK[0..count]).await?;
    file.flush().await?;
    Ok(())
}

/// Helper function for opening WAL segment `segno` in `dir`. Returns file and
/// whether it is .partial.
pub(crate) async fn open_wal_file(
Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_wal_acceptor_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ async def run_segment_init_failure(env: NeonEnv):

sk = env.safekeepers[0]
sk_http = sk.http_client()
sk_http.configure_failpoints([("sk-write-zeroes", "return")])
sk_http.configure_failpoints([("sk-zero-segment", "return")])
conn = await ep.connect_async()
ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary
# next insertion should hang until failpoint is disabled.
Expand Down

1 comment on commit f18aa04

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5337 tests run: 5114 passed, 1 failed, 222 skipped (full report)


Failures on Postgres 17

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharded_timeline_detach_ancestor[debug-pg17]"
Flaky tests (1)

Postgres 17

Test coverage report is not available

The comment gets automatically updated with the latest test results
f18aa04 at 2024-11-07T16:14:59.424Z :recycle:

Please sign in to comment.