From b67c760218b60fc97539e0aa039074232ea070dc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 14:08:50 +0200 Subject: [PATCH 01/24] WIP --- pageserver/src/bin/pageserver.rs | 3 + pageserver/src/config.rs | 3 + pageserver/src/tenant.rs | 14 ++++ pageserver/src/tenant/block_io.rs | 1 + pageserver/src/tenant/ephemeral_file.rs | 4 ++ .../src/tenant/ephemeral_file/page_caching.rs | 42 +++++++++++ .../ephemeral_file/zero_padded_read_write.rs | 8 +++ .../zero_padded_read_write/zero_padded.rs | 4 ++ .../tenant/storage_layer/inmemory_layer.rs | 70 ++++++++++++++----- pageserver/src/tenant/timeline.rs | 4 ++ pageserver/src/tenant/timeline/delete.rs | 1 + pageserver/src/tenant/timeline/l0_flush.rs | 42 +++++++++++ 12 files changed, 177 insertions(+), 19 deletions(-) create mode 100644 pageserver/src/tenant/timeline/l0_flush.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bdf1..3c94671ad2c8 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,6 +421,8 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + let l0_flush_semaphore = Arc::new(tokio::sync::Semaphore::new(conf.l0_flush_max_concurrency)); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +431,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_semaphore, }, order, shutdown_pageserver.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 104234841c82..52c76147f415 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -294,8 +294,11 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } + /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 62f066862a16..dc128798b4fd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -165,6 +165,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_semaphore: Arc, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -295,6 +296,9 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + /// Global semaphore for L0 flushes, passed in from tenant_mgr and we pass references to it to each timeline. 
+ pub(crate) l0_flush_semaphore: Arc, } impl std::fmt::Debug for Tenant { @@ -658,6 +662,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_semaphore, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -672,6 +677,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_semaphore, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -984,6 +990,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_semaphore: self.l0_flush_semaphore.clone(), }, ctx, ) @@ -1168,6 +1175,8 @@ impl Tenant { tenant_shard_id, remote_storage, DeletionQueueClient::broken(), + // broken tenant should never be flushing + Arc::new(tokio::sync::Semaphore::new(0)), )) } @@ -2493,6 +2502,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_semaphore: Arc, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2576,6 +2586,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_semaphore, } } @@ -3395,6 +3406,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_semaphore: self.l0_flush_semaphore.clone(), } } @@ -3920,6 +3932,8 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // in the past we didn't have a semaphore => use large value to avoid changing test behavior + Arc::new(tokio::sync::Semaphore::new(tokio::sync::Semaphore::MAX_PERMITS)), )); let preload = tenant diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 92928116c1f6..d534d3e2067a 100644 --- a/pageserver/src/tenant/block_io.rs +++ 
b/pageserver/src/tenant/block_io.rs @@ -81,6 +81,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&[u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf15373..e09a868c0942 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -65,6 +65,10 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + pub(crate) async fn load_into_contiguous_memory(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_into_contiguous_memory(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac8706493..23c3913bb4b7 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -49,6 +49,35 @@ impl RW { self.rw.bytes_written() } + pub(crate) async fn load_into_contiguous_memory( + &self, + ctx: &RequestContext, + ) -> Result, io::Error> { + let size = { + let s = self.bytes_written(); + s += s % PAGE_SZ; // round up to the next PAGE_SZ multiple, required by blob_io + }; + let buf = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let buf = writer + .read_all_written_blocks_into_memory_directly_from_disk( + buf.slice(0..writer.nwritten_blocks * PAGE_SZ), + ctx, + ) + .await?; + let vec = buf.into_inner(); + assert_eq!(vec.len() % PAGE_SZ, 0); + assert_eq!(vec.len() / PAGE_SZ, writer.nwritten_blocks as usize); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.inspect_buffer_zero_padded(); + vec.extend_from_slice(buffered); + 
assert_eq!(vec.len(), size); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -129,6 +158,19 @@ impl PreWarmingWriter { file, } } + + async fn read_all_written_blocks_into_memory_directly_from_disk< + B: tokio_epoll_uring::Slice, + Buf: tokio_epoll_uring::IoBufMut + Send, + >( + &self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result { + assert_eq!(buf.bytes_total() % PAGE_SZ, 0); + assert_eq!(buf.bytes_total() / PAGE_SZ, self.nwritten_blocks); + self.file.read_exact_at(buf, 0, ctx).await + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c5b..f473d946ad7f 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,14 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + pub fn inspect_buffer_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = u64::try_from(buffer.pending()).unwrap(); + // pad to next page boundary + let read_up_to = buffer_written_up_to + (buffer_written_up_to % PAGE_SZ); + buffer.as_zero_padded_slice()[0..buffer_written_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs index f90291bbf814..0f1e8da23992 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -36,6 +36,10 
@@ impl Buffer { pub fn as_zero_padded_slice(&self) -> &[u8; N] { &self.allocation } + + pub fn as_unpadded_slice(&self) -> &[u8] { + &self.allocation[0..self.written] + } } impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 1ecc56ce993f..26d87deb7829 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -7,10 +7,10 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{l0_flush, GetVectoredError}; use crate::tenant::{PageReconstructError, Timeline}; use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; @@ -620,6 +620,12 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner(); + let _memory_permit = match l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire(inner.file.len()).await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let keys: Vec<_> = if let Some(key_range) = key_range { @@ -647,23 +653,49 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); - - let cursor = inner.file.block_cursor(); - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + match l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let mut buf = Vec::new(); + + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. 
} => { + let file_contents = inner.file.load_into_contiguous_memory(ctx).await?; + assert_eq!(file_contents.len(), inner.file.len() as usize); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5398ad399c84..f8c0ffbaa726 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -9,6 +9,7 @@ pub(crate) mod logical_size; pub mod span; pub mod uninit; mod walreceiver; +pub(crate) mod l0_flush; use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; @@ -208,6 +209,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_semaphore: Arc, } pub(crate) struct AuxFilesState { @@ -433,6 +435,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424dde..dd1acfd70eb2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_semaphore: tenant.l0_flush_semaphore.clone(), }, // Important. We dont pass ancestor above because it can be missing. 
// Thus we need to skip the validation here. diff --git a/pageserver/src/tenant/timeline/l0_flush.rs b/pageserver/src/tenant/timeline/l0_flush.rs new file mode 100644 index 000000000000..a2d88d8efa79 --- /dev/null +++ b/pageserver/src/tenant/timeline/l0_flush.rs @@ -0,0 +1,42 @@ +use std::num::NonZeroUsize; + +#[derive(Default)] +pub enum L0FlushConfig { + #[default] + PageCached, + Direct { + /// Concurrent L0 flushes are limited to consume at most `max_memory` bytes of memory. + /// If there are a lot of small L0s that need to be flushed, a lot of flushes can happen in parallel. + /// If there is a large L0 to be flushed, it might have to wait until preceding flushes are done. + pub max_memory: MaxMemory, + }, +} + +/// Deserializer guarantees that that initializing the `tokio::sync::Semaphore` will succeed. +pub struct MaxMemory(NonZeroUsize); + +pub struct L0FlushGlobalState(Arc); + +pub(in crate::tenant::storage_layer::inmemory_layer) enum Inner { + PageCached, + Direct { + config: L0FlushConfig, + semaphore: tokio::sync::Semaphore, + }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Inner::PageCached), + L0FlushConfig::Direct { max_memory } => { + let semaphore = tokio::sync::Semaphore::new(max_memory.0.get()); + Self(Inner::Direct { config, semaphore }) + } + } + } + + pub(in crate::tenant::storage_layer::inmemory_layer) fn inner(&self) -> &Inner { + &self.0 + } +} From 14eb9a1b641eca1c73c0e8418c5ab3affc8d6c3b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 17:33:57 +0000 Subject: [PATCH 02/24] wip --- pageserver/src/config.rs | 2 +- pageserver/src/tenant/block_io.rs | 22 +++++++++++++++- .../src/tenant/ephemeral_file/page_caching.rs | 25 +++++++++++++------ .../tenant/storage_layer/inmemory_layer.rs | 1 + pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/l0_flush.rs | 8 +++--- pageserver/src/virtual_file.rs | 6 ++--- 7 files 
changed, 48 insertions(+), 17 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 52c76147f415..a3b45843bddd 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,7 +30,7 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; +use crate::tenant::timeline::{l0_flush::L0FlushConfig, GetVectoredImpl}; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index d534d3e2067a..715b4bad54c6 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -81,7 +82,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), - Slice(&[u8]), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -100,6 +101,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -108,6 +110,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if slice.len() > end { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", 
slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(slice)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 23c3913bb4b7..0cc0a23faf8c 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -49,32 +49,39 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via read_blk into a contiguous memory buffer. + /// + /// This includes blocks that are in the internal buffered writer's buffer. + /// The last block is zero-padded to `PAGE_SZ`, so, the returned buffer is always a multiple of `PAGE_SZ`. pub(crate) async fn load_into_contiguous_memory( &self, ctx: &RequestContext, ) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io let size = { - let s = self.bytes_written(); - s += s % PAGE_SZ; // round up to the next PAGE_SZ multiple, required by blob_io + let s = usize::try_from(self.bytes_written()).unwrap(); + s.checked_add(s % PAGE_SZ).unwrap() }; let buf = Vec::with_capacity(size); // read from disk what we've already flushed let writer = self.rw.as_writer(); + let nwritten_blocks = usize::try_from(writer.nwritten_blocks).unwrap(); let buf = writer .read_all_written_blocks_into_memory_directly_from_disk( - buf.slice(0..writer.nwritten_blocks * PAGE_SZ), + buf.slice(0..nwritten_blocks * PAGE_SZ), ctx, ) .await?; - let vec = buf.into_inner(); + let mut vec = buf.into_inner(); assert_eq!(vec.len() % PAGE_SZ, 0); - assert_eq!(vec.len() / PAGE_SZ, writer.nwritten_blocks as usize); + assert_eq!(vec.len() / PAGE_SZ, nwritten_blocks); // copy from in-memory buffer what we haven't flushed yet but would return when accessed 
via read_blk let buffered = self.rw.inspect_buffer_zero_padded(); vec.extend_from_slice(buffered); assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); Ok(vec) } @@ -160,15 +167,17 @@ impl PreWarmingWriter { } async fn read_all_written_blocks_into_memory_directly_from_disk< - B: tokio_epoll_uring::Slice, - Buf: tokio_epoll_uring::IoBufMut + Send, + B: tokio_epoll_uring::BoundedBufMut + Send, >( &self, buf: B, ctx: &RequestContext, ) -> std::io::Result { assert_eq!(buf.bytes_total() % PAGE_SZ, 0); - assert_eq!(buf.bytes_total() / PAGE_SZ, self.nwritten_blocks); + assert_eq!( + buf.bytes_total() / PAGE_SZ, + usize::try_from(self.nwritten_blocks).unwrap() + ); self.file.read_exact_at(buf, 0, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 26d87deb7829..3b00dd96eccc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -621,6 +621,7 @@ impl InMemoryLayer { let inner = self.inner.read().await; let l0_flush_global_state = timeline.l0_flush_global_state.inner(); + use l0_flush::Inner; let _memory_permit = match l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire(inner.file.len()).await), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f8c0ffbaa726..0851ae51b5ef 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,6 +17,7 @@ use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; +use l0_flush::L0FlushGlobalState; use once_cell::sync::Lazy; use pageserver_api::{ key::{ diff --git a/pageserver/src/tenant/timeline/l0_flush.rs b/pageserver/src/tenant/timeline/l0_flush.rs index a2d88d8efa79..32c6728701f6 100644 --- a/pageserver/src/tenant/timeline/l0_flush.rs +++ b/pageserver/src/tenant/timeline/l0_flush.rs @@ -1,4 +1,4 @@ -use std::num::NonZeroUsize; +use std::{num::NonZeroUsize, sync::Arc}; #[derive(Default)] pub enum L0FlushConfig { @@ -8,7 +8,7 @@ pub enum L0FlushConfig { /// Concurrent L0 flushes are limited to consume at most `max_memory` bytes of memory. /// If there are a lot of small L0s that need to be flushed, a lot of flushes can happen in parallel. /// If there is a large L0 to be flushed, it might have to wait until preceding flushes are done. 
- pub max_memory: MaxMemory, + max_memory: MaxMemory, }, } @@ -17,7 +17,7 @@ pub struct MaxMemory(NonZeroUsize); pub struct L0FlushGlobalState(Arc); -pub(in crate::tenant::storage_layer::inmemory_layer) enum Inner { +pub(crate) enum Inner { PageCached, Direct { config: L0FlushConfig, @@ -36,7 +36,7 @@ impl L0FlushGlobalState { } } - pub(in crate::tenant::storage_layer::inmemory_layer) fn inner(&self) -> &Inner { + pub(crate) fn inner(&self) -> &Inner { &self.0 } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab92..08bbfd44672b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -592,7 +592,7 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result where - B: IoBufMut + Send, + B: BoundedBufMut + Send, { let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { self.read_at(buf, offset, ctx) @@ -788,7 +788,7 @@ pub async fn read_exact_at_impl( mut read_at: F, ) -> (B, std::io::Result<()>) where - B: IoBufMut + Send, + B: BoundedBufMut + Send, F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, Fut: std::future::Future, std::io::Result)>, { From 282a633364a8f4b9f2e39943de478a88af865bcc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 17:39:34 +0000 Subject: [PATCH 03/24] read_exact_at_impl: accept a BoundedBuf --- pageserver/src/virtual_file.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab92..bf5bb802dfe0 100644 --- 
a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -786,13 +786,13 @@ pub async fn read_exact_at_impl( mut offset: u64, count: Option, mut read_at: F, -) -> (B, std::io::Result<()>) +) -> (B::Buf, std::io::Result<()>) where - B: IoBufMut + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + B: BoundedBuf + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = match count { + let mut buf: tokio_epoll_uring::Slice<_> = match count { Some(count) => { assert!(count <= buf.bytes_total()); assert!(count > 0); From 79a33dc5c0189e3e2b9f63c75fcf786783fd5e9d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 18:41:32 +0000 Subject: [PATCH 04/24] WIP --- pageserver/src/page_cache.rs | 31 +++++++++++ pageserver/src/virtual_file.rs | 96 +++++++++++++--------------------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index f386c825b848..dc1f5bebb14d 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -320,6 +320,37 @@ impl Drop for PageWriteGuard<'_> { } } +/// SAFETY: page cache slots are stable in memory and are zero-initialized. +unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuard<'static> { + fn stable_ptr(&self) -> *const u8 { + let data: &[u8; PAGE_SZ] = self.deref(); + data.as_ptr() + } + + fn bytes_init(&self) -> usize { + PAGE_SZ + } + + fn bytes_total(&self) -> usize { + PAGE_SZ + } +} + +/// SAFETY: underlying RwLockGuard guarantees unique access to the buffer +/// and the `&mut self` ensures something has unique ownership of `Self`. 
+unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuard<'static> { + fn stable_mut_ptr(&mut self) -> *mut u8 { + let data: &mut [u8; PAGE_SZ] = self.deref_mut(); + data.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + // There should never be a reason to call this because bytes_init() == bytes_total(). + // But there's nothing preventing a caller from doing so. + assert_eq!(pos, PAGE_SZ); + } +} + /// lock_for_read() return value pub enum ReadBufResult<'a> { Found(PageReadGuard<'a>), diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index bf5bb802dfe0..2097ae083ff0 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,14 +13,14 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -140,37 +140,6 @@ struct SlotInner { file: Option, } -/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. -struct PageWriteGuardBuf { - page: PageWriteGuard<'static>, - init_up_to: usize, -} -// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, -// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. 
-unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { - fn stable_ptr(&self) -> *const u8 { - self.page.as_ptr() - } - fn bytes_init(&self) -> usize { - self.init_up_to - } - fn bytes_total(&self) -> usize { - self.page.len() - } -} -// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, -// hence it's safe to hand out the `stable_mut_ptr()`. -unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { - fn stable_mut_ptr(&mut self) -> *mut u8 { - self.page.as_mut_ptr() - } - - unsafe fn set_init(&mut self, pos: usize) { - assert!(pos <= self.page.len()); - self.init_up_to = pos; - } -} - impl OpenFiles { /// Find a slot to use, evicting an existing file descriptor if needed. /// @@ -585,14 +554,18 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at( + /// Read `buf.bytes_total()` bytes into buf, starting at offset `offset`. + /// + /// The returned slice is guaranteed to be the same view into the underlying `Buf` as the input argument `buf`. + pub async fn read_exact_at( &self, buf: B, offset: u64, ctx: &RequestContext, - ) -> Result + ) -> Result, Error> where - B: IoBufMut + Send, + Buf: IoBufMut + Send, + B: BoundedBufMut, { let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { self.read_at(buf, offset, ctx) @@ -601,21 +574,25 @@ impl VirtualFile { res.map(|()| buf) } - pub async fn read_exact_at_n( + /// Read `count` bytes into `buf`, starting at offset `offset`. + /// + /// The returned `B` is the same `buf` that was passed in. + /// On success, it is guaranteed that the returned `B` has `bytes_init() >= count`. 
+ pub async fn read_exact_at_n( &self, - buf: B, + buf: Buf, offset: u64, count: usize, ctx: &RequestContext, - ) -> Result + ) -> Result where - B: IoBufMut + Send, + Buf: IoBufMut + Send, { let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { self.read_at(buf, offset, ctx) }) .await; - res.map(|()| buf) + res.map(|()| buf.into_inner()) } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -625,13 +602,9 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { - page, - init_up_to: 0, - }; - let res = self.read_exact_at(buf, offset, ctx).await; - res.map(|PageWriteGuardBuf { page, .. }| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + assert_eq!(page.len(), PAGE_SZ); + let res = self.read_exact_at_n(page, offset, PAGE_SZ, ctx).await; + res.map_err(|e| Error::new(ErrorKind::Other, e)) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -781,17 +754,20 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( +pub async fn read_exact_at_impl( buf: B, mut offset: u64, count: Option, mut read_at: F, -) -> (B::Buf, std::io::Result<()>) +) -> (tokio_epoll_uring::Slice, std::io::Result<()>) where - B: BoundedBuf + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Buf: IoBufMut + Send, + B: BoundedBufMut, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { + let original_bounds = buf.bounds(); + let mut buf: tokio_epoll_uring::Slice<_> = match count { Some(count) => { assert!(count <= buf.bytes_total()); @@ -811,7 +787,7 @@ where offset += n as u64; } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (buf.into_inner(), Err(e)), + Err(e) => return (buf.into_inner().slice(original_bounds), Err(e)), } } // NB: don't use 
`buf.is_empty()` here; it is from the @@ -821,7 +797,7 @@ where // buffer that the user passed in. if buf.bytes_total() != 0 { ( - buf.into_inner(), + buf.into_inner().slice(original_bounds), Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, "failed to fill whole buffer", @@ -829,7 +805,7 @@ where ) } else { assert_eq!(buf.len(), buf.bytes_total()); - (buf.into_inner(), Ok(())) + (buf.into_inner().slice(original_bounds), Ok(())) } } @@ -1053,7 +1029,7 @@ impl VirtualFile { use crate::page_cache::PAGE_SZ; let buf = vec![0; PAGE_SZ]; let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + .read_exact_at_n(buf, blknum as u64 * (PAGE_SZ as u64), PAGE_SZ, ctx) .await?; Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } @@ -1211,7 +1187,9 @@ mod tests { ctx: &RequestContext, ) -> Result, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, + MaybeVirtualFile::VirtualFile(file) => { + file.read_exact_at_n(buf, offset, buf.len(), ctx).await + } MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } @@ -1507,7 +1485,7 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); + buf = f.read_exact_at_n(buf, 0, SIZE, &ctx).await.unwrap(); assert!(buf == SAMPLE); } }); From c7fc16929d204101364b6a26ecc1ab7cbffdd122 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 19:11:38 +0000 Subject: [PATCH 05/24] WIP --- pageserver/src/page_cache.rs | 31 ----------------------- pageserver/src/virtual_file.rs | 45 ++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index dc1f5bebb14d..f386c825b848 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -320,37 +320,6 @@ impl Drop for PageWriteGuard<'_> { } } -/// 
SAFETY: page cache slots are stable in memory and are zero-initialized. -unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuard<'static> { - fn stable_ptr(&self) -> *const u8 { - let data: &[u8; PAGE_SZ] = self.deref(); - data.as_ptr() - } - - fn bytes_init(&self) -> usize { - PAGE_SZ - } - - fn bytes_total(&self) -> usize { - PAGE_SZ - } -} - -/// SAFETY: underlying RwLockGuard guarantees unique access to the buffer -/// and the `&mut self` ensures something has unique ownership of `Self`. -unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuard<'static> { - fn stable_mut_ptr(&mut self) -> *mut u8 { - let data: &mut [u8; PAGE_SZ] = self.deref_mut(); - data.as_mut_ptr() - } - - unsafe fn set_init(&mut self, pos: usize) { - // There should never be a reason to call this because bytes_init() == bytes_total(). - // But there's nothing preventing a caller from doing so. - assert_eq!(pos, PAGE_SZ); - } -} - /// lock_for_read() return value pub enum ReadBufResult<'a> { Found(PageReadGuard<'a>), diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 2097ae083ff0..9c8e8de0e0ad 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,7 +13,7 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; +use crate::page_cache::PageWriteGuard; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; @@ -140,6 +140,38 @@ struct SlotInner { file: Option, } +/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. +struct PageWriteGuardBuf { + page: PageWriteGuard<'static>, +} +// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, +// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. 
+// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) +unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { + fn stable_ptr(&self) -> *const u8 { + self.page.as_ptr() + } + fn bytes_init(&self) -> usize { + self.page.len() + } + fn bytes_total(&self) -> usize { + self.page.len() + } +} +// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, +// hence it's safe to hand out the `stable_mut_ptr()`. +unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.page.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). + assert!(pos <= self.page.len()); + } +} + impl OpenFiles { /// Find a slot to use, evicting an existing file descriptor if needed. /// @@ -602,9 +634,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - assert_eq!(page.len(), PAGE_SZ); - let res = self.read_exact_at_n(page, offset, PAGE_SZ, ctx).await; - res.map_err(|e| Error::new(ErrorKind::Other, e)) + let page_sz = page.len(); + let buf = PageWriteGuardBuf { page }; + let res = self.read_exact_at_n(buf, offset, page_sz, ctx).await; + res.map(|PageWriteGuardBuf { page, .. 
}| page) + .map_err(|e| Error::new(ErrorKind::Other, e)) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -1188,7 +1222,8 @@ mod tests { ) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - file.read_exact_at_n(buf, offset, buf.len(), ctx).await + let n = buf.len(); + file.read_exact_at_n(buf, offset, n, ctx).await } MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } From 27cae3ca858b0ea5032bb2222e24150469e50eae Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 19:14:44 +0000 Subject: [PATCH 06/24] it compiles --- pageserver/src/virtual_file.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9c8e8de0e0ad..a62fb7dad558 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -906,7 +906,7 @@ mod test_read_exact_at_impl { }) .await; assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); + assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c', b'd', b'e']); } #[tokio::test] @@ -926,7 +926,7 @@ mod test_read_exact_at_impl { }) .await; assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c']); + assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c']); } #[tokio::test] @@ -966,7 +966,7 @@ mod test_read_exact_at_impl { }) .await; assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); + assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c', b'd']); } #[tokio::test] From 5031f41b39d702a222ff9c25f42e7cf2defab3c5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 11:40:03 +0000 Subject: [PATCH 07/24] hack hack --- pageserver/src/bin/pageserver.rs | 5 ++-- pageserver/src/config.rs | 17 ++++++++++-- .../src/{tenant/timeline => }/l0_flush.rs | 23 +++++++--------- pageserver/src/lib.rs | 1 + pageserver/src/tenant.rs | 26 +++++++++---------- pageserver/src/tenant/block_io.rs | 3 ++- 
.../ephemeral_file/zero_padded_read_write.rs | 4 +-- .../zero_padded_read_write/zero_padded.rs | 4 --- .../tenant/storage_layer/inmemory_layer.rs | 12 +++++---- pageserver/src/tenant/timeline.rs | 8 +++--- pageserver/src/tenant/timeline/delete.rs | 2 +- 11 files changed, 58 insertions(+), 47 deletions(-) rename pageserver/src/{tenant/timeline => }/l0_flush.rs (50%) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 3c94671ad2c8..d662e5d611fb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,7 +421,8 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; - let l0_flush_semaphore = Arc::new(tokio::sync::Semaphore::new(conf.l0_flush_max_concurrency)); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); @@ -431,7 +432,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, - l0_flush_semaphore, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a3b45843bddd..46e93d8dcb8e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,7 +30,7 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::{l0_flush::L0FlushConfig, GetVectoredImpl}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -298,7 +298,6 @@ pub struct PageServerConf { pub l0_flush: L0FlushConfig, } - /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token 
is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in @@ -402,6 +401,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -490,6 +491,7 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::PageCached), } } } @@ -677,6 +679,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -734,6 +740,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -1017,6 +1024,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? 
as usize) } + "l0_flush" => { + builder.l0_flush(toml_edit::de::from_document(item)) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1100,6 +1110,7 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::PageCached, } } } @@ -1339,6 +1350,7 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::PageCached, }, "Correct defaults should be used when no config values are provided" ); @@ -1412,6 +1424,7 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::PageCached, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/tenant/timeline/l0_flush.rs b/pageserver/src/l0_flush.rs similarity index 50% rename from pageserver/src/tenant/timeline/l0_flush.rs rename to pageserver/src/l0_flush.rs index e43893b66299..4be75ad7387c 100644 --- a/pageserver/src/tenant/timeline/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,40 +1,37 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Default, Debug, PartialEq, Eq)] +#[derive(Default, Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { #[default] PageCached, Direct { - max_memory: NonZeroUsize, max_concurrency: NonZeroUsize, }, + Fail(&'static str), } +#[derive(Clone)] pub struct L0FlushGlobalState(Arc); pub(crate) enum Inner { PageCached, - Direct { - config: L0FlushConfig, - semaphore: tokio::sync::Semaphore, - }, + Direct { semaphore: tokio::sync::Semaphore }, + Fail(&'static str), } impl L0FlushGlobalState { pub fn new(config: L0FlushConfig) -> Self { match config { - L0FlushConfig::PageCached => 
Self(Inner::PageCached), - L0FlushConfig::Direct { - max_memory, - max_concurrency, - } => { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); - Self(Inner::Direct { config, semaphore }) + Self(Arc::new(Inner::Direct { semaphore })) } + L0FlushConfig::Fail(msg) => Self(Arc::new(Inner::Fail(msg))), } } - pub(crate) fn inner(&self) -> &Inner { + pub(crate) fn inner(&self) -> &Arc { &self.0 } } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 9e64eafffcab..93fc5269f417 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cb46a8e97ac5..f7a23402faa2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,8 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushConfig; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -165,7 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, - pub l0_flush_semaphore: Arc, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -297,8 +299,7 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. 
ongoing_timeline_detach: std::sync::Mutex>, - /// Global semaphore for L0 flushes, passed in from tenant_mgr and we pass references to it to each timeline. - pub(crate) l0_flush_semaphore: Arc, + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -662,7 +663,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, - l0_flush_semaphore, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -677,7 +678,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, - l0_flush_semaphore, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -990,7 +991,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), - l0_flush_semaphore: self.l0_flush_semaphore.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1175,8 +1176,7 @@ impl Tenant { tenant_shard_id, remote_storage, DeletionQueueClient::broken(), - // broken tenant should never be flushing - Arc::new(tokio::sync::Semaphore::new(0)), + L0FlushGlobalState::new(L0FlushConfig::Fail("broken tenant should not be flushing L0s")), )) } @@ -2502,7 +2502,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, - l0_flush_semaphore: Arc, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2586,7 +2586,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), - l0_flush_semaphore, + l0_flush_global_state, } } @@ -3411,7 +3411,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), - l0_flush_semaphore: self.l0_flush_semaphore.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), 
} } @@ -3937,8 +3937,8 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), - // in the past we didn't have a semaphore => use large value to avoid changing test behavior - Arc::new(tokio::sync::Semaphore::new(tokio::sync::Semaphore::MAX_PERMITS)), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 715b4bad54c6..84c13caa03f3 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -64,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -124,7 +125,7 @@ impl<'a> BlockReaderRef<'a> { let page_sized: &[u8; PAGE_SZ] = slice .try_into() .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); - Ok(BlockLease::Slice(slice)) + Ok(BlockLease::Slice(page_sized)) } } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index f473d946ad7f..af9996813484 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -77,10 +77,10 @@ where pub fn inspect_buffer_zero_padded(&self) -> &[u8] { let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffer_written_up_to = u64::try_from(buffer.pending()).unwrap(); + let buffer_written_up_to = usize::try_from(buffer.pending()).unwrap(); // pad to next page boundary let read_up_to = buffer_written_up_to + (buffer_written_up_to % PAGE_SZ); - buffer.as_zero_padded_slice()[0..buffer_written_up_to] + &buffer.as_zero_padded_slice()[0..read_up_to] } pub(crate) async fn 
read_blk(&self, blknum: u32) -> Result, std::io::Error> { diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs index 0f1e8da23992..f90291bbf814 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -36,10 +36,6 @@ impl Buffer { pub fn as_zero_padded_slice(&self) -> &[u8; N] { &self.allocation } - - pub fn as_unpadded_slice(&self) -> &[u8] { - &self.allocation[0..self.written] - } } impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 32fb1cf4187c..7c6266a6bd19 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -10,9 +10,9 @@ use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; -use crate::tenant::timeline::{l0_flush, GetVectoredError}; +use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -620,11 +620,12 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. 
let inner = self.inner.read().await; - let l0_flush_global_state = timeline.l0_flush_global_state.inner(); + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); use l0_flush::Inner; - let _concurrency_permit = match l0_flush_global_state { + let _concurrency_permit = match &*l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), + Inner::Fail(msg) => anyhow::bail!(*msg), }; let end_lsn = *self.end_lsn.get().unwrap(); @@ -652,7 +653,7 @@ impl InMemoryLayer { ) .await?; - match l0_flush_global_state { + match &*l0_flush_global_state { l0_flush::Inner::PageCached => { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) @@ -701,6 +702,7 @@ impl InMemoryLayer { // => we'd have more concurrenct Vec than allowed as per the semaphore. drop(_concurrency_permit); } + l0_flush::Inner::Fail(_) => unreachable!("we bail out earlier in this function"), } // MAX is used here because we identify L0 layers by full key range diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0851ae51b5ef..773c02ce5ab4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -9,7 +9,6 @@ pub(crate) mod logical_size; pub mod span; pub mod uninit; mod walreceiver; -pub(crate) mod l0_flush; use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; @@ -17,7 +16,6 @@ use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use l0_flush::L0FlushGlobalState; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -67,7 +65,7 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; +use crate::{l0_flush::{self, L0FlushGlobalState}, metrics::GetKind}; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -210,7 +208,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< 
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, - pub l0_flush_semaphore: Arc, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -2381,6 +2379,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index dd1acfd70eb2..b0088f4ea228 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,7 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), - l0_flush_semaphore: tenant.l0_flush_semaphore.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
From fdeff8716c22eb6d480a94598f371635c96818f3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 11:53:26 +0000 Subject: [PATCH 08/24] refactor: generalize toml_edit::Item deserialization --- Cargo.lock | 1 + libs/remote_storage/src/config.rs | 15 +-------------- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 2 ++ libs/utils/src/toml_edit_ext.rs | 22 ++++++++++++++++++++++ pageserver/src/config.rs | 15 ++++++++++++++- 6 files changed, 41 insertions(+), 15 deletions(-) create mode 100644 libs/utils/src/toml_edit_ext.rs diff --git a/Cargo.lock b/Cargo.lock index 5393538c5902..6dae8e340348 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6811,6 +6811,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e99b..100fd1c28a31 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -177,19 +176,7 @@ impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + Ok(utils::toml_edit_ext::deserialize_item(toml)?) 
} } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c144..261ca2cc1ac0 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4fd..2a397d97d2b9 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -94,6 +94,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 000000000000..ab5f7bdd95ab --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 46e93d8dcb8e..73ce010c3ef5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -935,7 +935,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { 
- builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) + builder.remote_storage_config(RemoteStorageConfig::from_toml(item).context("remote_storage")?) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -1704,6 +1704,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(&input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); From eb593a721767902e23bd75c5a23ef1edee074a84 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 12:02:35 +0000 Subject: [PATCH 09/24] hack hack hack --- pageserver/src/config.rs | 4 ++-- pageserver/src/l0_flush.rs | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 73ce010c3ef5..f54257c9f02b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, 
TENANT_LOCATION_CONFIG_NAME, @@ -1025,7 +1025,7 @@ impl PageServerConf { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } "l0_flush" => { - builder.l0_flush(toml_edit::de::from_document(item)) + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item)?) } _ => bail!("unrecognized pageserver option '{key}'"), } diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 4be75ad7387c..f605d7c9b5e8 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,12 +1,15 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Default, Debug, PartialEq, Eq, Clone)] +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { #[default] PageCached, + #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize, }, + #[serde(skip)] Fail(&'static str), } From 38608b3b5c4fb1b30e86db51a20d4987c11c6563 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 12:09:55 +0000 Subject: [PATCH 10/24] hack hack hack --- pageserver/src/config.rs | 2 +- pageserver/src/l0_flush.rs | 8 +++----- pageserver/src/tenant.rs | 4 +++- pageserver/src/tenant/ephemeral_file.rs | 5 ++++- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 5 ++++- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f54257c9f02b..a8409c0446b6 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1025,7 +1025,7 @@ impl PageServerConf { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } "l0_flush" => { - builder.l0_flush(utils::toml_edit_ext::deserialize_item(item)?) + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) 
} _ => bail!("unrecognized pageserver option '{key}'"), } diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index f605d7c9b5e8..416731e3b0a0 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -6,11 +6,9 @@ pub enum L0FlushConfig { #[default] PageCached, #[serde(rename_all = "snake_case")] - Direct { - max_concurrency: NonZeroUsize, - }, + Direct { max_concurrency: NonZeroUsize }, #[serde(skip)] - Fail(&'static str), + Fail(String), } #[derive(Clone)] @@ -19,7 +17,7 @@ pub struct L0FlushGlobalState(Arc); pub(crate) enum Inner { PageCached, Direct { semaphore: tokio::sync::Semaphore }, - Fail(&'static str), + Fail(String), } impl L0FlushGlobalState { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f7a23402faa2..b7f9e9c40146 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1176,7 +1176,9 @@ impl Tenant { tenant_shard_id, remote_storage, DeletionQueueClient::broken(), - L0FlushGlobalState::new(L0FlushConfig::Fail("broken tenant should not be flushing L0s")), + L0FlushGlobalState::new(L0FlushConfig::Fail( + "broken tenant should not be flushing L0s".to_owned(), + )), )) } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index e09a868c0942..7240e3bc0951 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -65,7 +65,10 @@ impl EphemeralFile { self.rw.page_cache_file_id() } - pub(crate) async fn load_into_contiguous_memory(&self, ctx: &RequestContext) -> Result, io::Error> { + pub(crate) async fn load_into_contiguous_memory( + &self, + ctx: &RequestContext, + ) -> Result, io::Error> { self.rw.load_into_contiguous_memory(ctx).await } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c6266a6bd19..3a58b51ef6c7 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ 
b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -625,7 +625,7 @@ impl InMemoryLayer { let _concurrency_permit = match &*l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), - Inner::Fail(msg) => anyhow::bail!(*msg), + Inner::Fail(msg) => anyhow::bail!(msg.clone()), }; let end_lsn = *self.end_lsn.get().unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 773c02ce5ab4..4add3e4a5ada 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -65,7 +65,6 @@ use std::{ ops::{Deref, Range}, }; -use crate::{l0_flush::{self, L0FlushGlobalState}, metrics::GetKind}; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -90,6 +89,10 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; From d4f419a281101bd8a13cd2d7c1bedf8bddc55212 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 12:36:43 +0000 Subject: [PATCH 11/24] fix some bugs --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/tenant/block_io.rs | 2 +- .../src/tenant/ephemeral_file/page_caching.rs | 8 ++++++-- .../ephemeral_file/zero_padded_read_write.rs | 11 +++++++++-- .../src/tenant/storage_layer/inmemory_layer.rs | 15 ++++++++++++++- 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d662e5d611fb..39d4e46c9663 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,6 +421,7 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); let 
l0_flush_global_state = pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 84c13caa03f3..d46e51e3d3d5 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -115,7 +115,7 @@ impl<'a> BlockReaderRef<'a> { fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); let end = start.checked_add(PAGE_SZ).unwrap(); - if slice.len() > end { + if end > slice.len() { return Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, format!("slice too short, len={} end={}", slice.len(), end), diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 40b41218cff6..4e13b5a72db9 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -60,7 +60,11 @@ impl RW { // round up to the next PAGE_SZ multiple, required by blob_io let size = { let s = usize::try_from(self.bytes_written()).unwrap(); - s.checked_add(s % PAGE_SZ).unwrap() + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } }; let buf = Vec::with_capacity(size); @@ -78,7 +82,7 @@ impl RW { assert_eq!(vec.len() / PAGE_SZ, nwritten_blocks); // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk - let buffered = self.rw.inspect_buffer_zero_padded(); + let buffered = self.rw.inspect_served_from_zero_padded_mutable_tail(); vec.extend_from_slice(buffered); assert_eq!(vec.len(), size); assert_eq!(vec.len() % PAGE_SZ, 0); diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index af9996813484..60a07904c1ba 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ 
b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,11 +75,18 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } - pub fn inspect_buffer_zero_padded(&self) -> &[u8] { + /// Get a slice of all blocks that read_blk would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. + pub fn inspect_served_from_zero_padded_mutable_tail(&self) -> &[u8] { let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); let buffer_written_up_to = usize::try_from(buffer.pending()).unwrap(); // pad to next page boundary - let read_up_to = buffer_written_up_to + (buffer_written_up_to % PAGE_SZ); + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; &buffer.as_zero_padded_slice()[0..read_up_to] } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 3a58b51ef6c7..15effc53f225 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,6 +6,7 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; @@ -678,7 +679,19 @@ impl InMemoryLayer { } l0_flush::Inner::Direct { .. 
} => { let file_contents: Vec = inner.file.load_into_contiguous_memory(ctx).await?; - assert_eq!(file_contents.len(), inner.file.len() as usize); + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); From 9cdbc7b0446a69ab06e4e67005bab199af23bc26 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Jun 2024 14:33:32 +0000 Subject: [PATCH 12/24] don't prewarm page cache on EphemeralFile write and add TODO comment for read path page-caching --- pageserver/src/l0_flush.rs | 9 ++ pageserver/src/tenant/ephemeral_file.rs | 10 +- .../src/tenant/ephemeral_file/page_caching.rs | 91 +++++++++++-------- .../tenant/storage_layer/inmemory_layer.rs | 6 +- pageserver/src/tenant/timeline.rs | 1 + .../src/tenant/timeline/layer_manager.rs | 13 ++- 6 files changed, 88 insertions(+), 42 deletions(-) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 416731e3b0a0..7f3a5902079b 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,5 +1,7 @@ use std::{num::NonZeroUsize, sync::Arc}; +use crate::tenant::ephemeral_file; + #[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { @@ -35,4 +37,11 @@ impl L0FlushGlobalState { pub(crate) fn inner(&self) -> &Arc { &self.0 } + + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + match &*self.0 { + Inner::PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Inner::Direct { .. 
} | Inner::Fail(_) => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 7240e3bc0951..cfef0f9499a8 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -28,6 +29,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + prewarm_on_write: page_caching::PrewarmOnWrite, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -53,7 +55,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, prewarm_on_write), }) } @@ -162,7 +164,11 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let prewarm_on_write = + crate::l0_flush::L0FlushGlobalState::new(crate::l0_flush::L0FlushConfig::default()) + .prewarm_on_write(); + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, prewarm_on_write, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 4e13b5a72db9..c87c92856d87 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -19,14 +19,21 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn 
new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -156,14 +163,20 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, @@ -235,45 +248,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. 
let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. 
+ + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here } - }, + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. + unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 15effc53f225..2d4a84393cd4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -9,7 +9,7 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; -use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::ephemeral_file::{self, EphemeralFile}; use 
crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; @@ -411,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -474,11 +475,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + prewarm_on_write: ephemeral_file::PrewarmPageCacheOnWrite, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, prewarm_on_write, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4add3e4a5ada..7db1f0bdabba 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3671,6 +3671,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, + self.l0_flush_global_state.prewarm_on_write(), ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 550a9a567a43..85d5effc118c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -13,6 +13,7 @@ use crate::{ context::RequestContext, metrics::TimelineMetrics, tenant::{ + ephemeral_file, layer_map::{BatchedUpdates, LayerMap}, storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, @@ -73,6 +74,7 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + prewarm_on_write: ephemeral_file::PrewarmPageCacheOnWrite, ctx: &RequestContext, ) -> 
Result> { ensure!(lsn.is_aligned()); @@ -109,8 +111,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + prewarm_on_write, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); From 2f8d12b86d64e0ca752af17f81bffa6810f8d962 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 13:50:49 +0000 Subject: [PATCH 13/24] self-review --- pageserver/src/tenant/ephemeral_file.rs | 1 + .../src/tenant/ephemeral_file/page_caching.rs | 15 +++++++++------ .../ephemeral_file/zero_padded_read_write.rs | 2 +- .../src/tenant/storage_layer/inmemory_layer.rs | 11 +++++++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index cfef0f9499a8..03c7ac36d3ab 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -67,6 +67,7 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_into_contiguous_memory`]. pub(crate) async fn load_into_contiguous_memory( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index c87c92856d87..1fc787f835b9 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -19,6 +19,8 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? 
#[derive(Clone, Copy)] pub enum PrewarmOnWrite { Yes, @@ -56,11 +58,11 @@ impl RW { self.rw.bytes_written() } - /// Load all blocks that can be read via read_blk into a contiguous memory buffer. + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. /// - /// This includes blocks that are in the internal buffered writer's buffer. - /// The last block is zero-padded to `PAGE_SZ`, so, the returned buffer is always a multiple of `PAGE_SZ`. - pub(crate) async fn load_into_contiguous_memory( + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. + pub(super) async fn load_into_contiguous_memory( &self, ctx: &RequestContext, ) -> Result, io::Error> { @@ -79,7 +81,7 @@ impl RW { let writer = self.rw.as_writer(); let nwritten_blocks = usize::try_from(writer.nwritten_blocks).unwrap(); let buf = writer - .read_all_written_blocks_into_memory_directly_from_disk( + .load_flushed_blocks_into_contiguous_memory( buf.slice(0..nwritten_blocks * PAGE_SZ), ctx, ) @@ -183,7 +185,8 @@ impl PreWarmingWriter { } } - async fn read_all_written_blocks_into_memory_directly_from_disk( + /// Load all the blocks that we already flushed to disk into `buf`. + async fn load_flushed_blocks_into_contiguous_memory( &self, buf: B, ctx: &RequestContext, diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 60a07904c1ba..b47f95b31645 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,7 +75,7 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } - /// Get a slice of all blocks that read_blk would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
+ /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. pub fn inspect_served_from_zero_padded_mutable_tail(&self) -> &[u8] { let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); let buffer_written_up_to = usize::try_from(buffer.pending()).unwrap(); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2d4a84393cd4..5a88eeacd65e 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -411,7 +411,7 @@ impl InMemoryLayer { continue; } - // TODO: this uses the page cache + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -480,7 +480,8 @@ impl InMemoryLayer { ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, prewarm_on_write, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, prewarm_on_write, ctx) + .await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { @@ -702,6 +703,12 @@ impl InMemoryLayer { for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. 
pass that `buf` into `put_value_bytes` + // => https://github.com/neondatabase/neon/issues/8183 cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; From 647f0848d4f50198b3ec1df6d8daf9bfaeca7f3f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 15:06:15 +0000 Subject: [PATCH 14/24] get rid of read_exact_at alltogether --- pageserver/src/virtual_file.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a62fb7dad558..37383a6c8af3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -586,26 +586,6 @@ impl VirtualFile { Ok(self.pos) } - /// Read `buf.bytes_total()` bytes into buf, starting at offset `offset`. - /// - /// The returned slice is guaranteed to be the same view into the underlying `Buf` as the input argument `buf`. - pub async fn read_exact_at( - &self, - buf: B, - offset: u64, - ctx: &RequestContext, - ) -> Result, Error> - where - Buf: IoBufMut + Send, - B: BoundedBufMut, - { - let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) - } - /// Read `count` bytes into `buf`, starting at offset `offset`. /// /// The returned `B` is the same `buf` that was passed in. 
From 928c1dc1cde2e39cee21b2bb6eae645f330a9ad3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 15:31:40 +0000 Subject: [PATCH 15/24] re-add read_exact_at --- pageserver/src/virtual_file.rs | 75 ++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 37383a6c8af3..558b12ff4ec3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,7 +13,7 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; @@ -586,25 +586,58 @@ impl VirtualFile { Ok(self.pos) } - /// Read `count` bytes into `buf`, starting at offset `offset`. + /// Read the file contents in range `offset..(offset+count)` into `slice[0..count]`. /// - /// The returned `B` is the same `buf` that was passed in. - /// On success, it is guaranteed that the returned `B` has `bytes_init() >= count`. - pub async fn read_exact_at_n( + /// The returned `Buf` is `slice`'s underlying buffer. + pub async fn read_exact_at_n( &self, - buf: Buf, + slice: B, offset: u64, count: usize, ctx: &RequestContext, ) -> Result where Buf: IoBufMut + Send, + B: BoundedBufMut, { - let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { + let (buf, res) = read_exact_at_impl(slice, offset, Some(count), |buf, offset| { self.read_at(buf, offset, ctx) }) .await; - res.map(|()| buf.into_inner()) + res.map(|_| buf) + } + + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. 
+ pub async fn read_exact_at( + &self, + slice: B, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> + where + Buf: IoBufMut + Send, + B: BoundedBufMut, + { + let check_equal_view = if cfg!(debug_assertions) { + Some((slice.stable_ptr(), slice.bytes_total())) + } else { + None + }; + let original_bounds = slice.bounds(); + let (buf, res) = read_exact_at_impl(slice, offset, None, |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; + let res = res.map(|_| buf.slice(original_bounds)); + if let Some(orig) = check_equal_view { + if let Ok(slice) = &res { + let returning = (slice.stable_ptr(), slice.bytes_total()); + assert_eq!(orig, returning); + } + } + res } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -614,11 +647,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let page_sz = page.len(); let buf = PageWriteGuardBuf { page }; - let res = self.read_exact_at_n(buf, offset, page_sz, ctx).await; - res.map(|PageWriteGuardBuf { page, .. 
}| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + debug_assert_eq!(BoundedBufMut::bytes_total(&buf), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|PageWriteGuardBuf { page }| page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -778,10 +811,8 @@ where Buf: IoBufMut + Send, B: BoundedBufMut, F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Fut: std::future::Future)>, { - let original_bounds = buf.bounds(); - let mut buf: tokio_epoll_uring::Slice<_> = match count { Some(count) => { assert!(count <= buf.bytes_total()); @@ -801,7 +832,7 @@ where offset += n as u64; } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (buf.into_inner().slice(original_bounds), Err(e)), + Err(e) => return (buf.into_inner(), Err(e)), } } // NB: don't use `buf.is_empty()` here; it is from the @@ -811,7 +842,7 @@ where // buffer that the user passed in. if buf.bytes_total() != 0 { ( - buf.into_inner().slice(original_bounds), + buf.into_inner(), Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, "failed to fill whole buffer", @@ -819,7 +850,7 @@ where ) } else { assert_eq!(buf.len(), buf.bytes_total()); - (buf.into_inner().slice(original_bounds), Ok(())) + (buf.into_inner(), Ok(())) } } @@ -1041,9 +1072,9 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = vec![0; PAGE_SZ]; + let buf = Vec::with_capacity(PAGE_SZ); let buf = self - .read_exact_at_n(buf, blknum as u64 * (PAGE_SZ as u64), PAGE_SZ, ctx) + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } @@ -1196,14 +1227,14 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: Vec, + mut buf: tokio_epoll_uring::Slice, offset: u64, ctx: &RequestContext, ) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => { 
let n = buf.len(); - file.read_exact_at_n(buf, offset, n, ctx).await + file.read_exact_at(buf, offset, n, ctx).await } MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } From df5659541e88871bd1ea4873cad58ac66fb24c1d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 16:54:31 +0000 Subject: [PATCH 16/24] finish & fix some ub with std-fs (will pull this into a preliminary) --- pageserver/src/tenant/vectored_blob_io.rs | 6 +- pageserver/src/virtual_file.rs | 167 +++++++----------- pageserver/src/virtual_file/io_engine.rs | 33 ++-- .../virtual_file/owned_buffers_io/slice.rs | 77 ++++++++ 4 files changed, 156 insertions(+), 127 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/slice.rs diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1241a1390209..7ad8446e0411 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,6 +20,7 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; @@ -316,8 +317,9 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size(), ctx) - .await?; + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? 
+ .into_inner(); let blobs_at = read.blobs_at.as_slice(); let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 558b12ff4ec3..51b0c420c346 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -48,6 +48,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { pub(crate) mod size_tracking_writer; @@ -586,57 +587,36 @@ impl VirtualFile { Ok(self.pos) } - /// Read the file contents in range `offset..(offset+count)` into `slice[0..count]`. - /// - /// The returned `Buf` is `slice`'s underlying buffer. - pub async fn read_exact_at_n( - &self, - slice: B, - offset: u64, - count: usize, - ctx: &RequestContext, - ) -> Result - where - Buf: IoBufMut + Send, - B: BoundedBufMut, - { - let (buf, res) = read_exact_at_impl(slice, offset, Some(count), |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|_| buf) - } - /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. /// /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. 
- pub async fn read_exact_at( + pub async fn read_exact_at( &self, - slice: B, + slice: Slice, offset: u64, ctx: &RequestContext, - ) -> Result, Error> + ) -> Result, Error> where Buf: IoBufMut + Send, - B: BoundedBufMut, { - let check_equal_view = if cfg!(debug_assertions) { - Some((slice.stable_ptr(), slice.bytes_total())) + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) } else { None }; + let original_bounds = slice.bounds(); - let (buf, res) = read_exact_at_impl(slice, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; let res = res.map(|_| buf.slice(original_bounds)); - if let Some(orig) = check_equal_view { + + if let Some(original_bounds) = assert_we_return_original_bounds { if let Ok(slice) = &res { - let returning = (slice.stable_ptr(), slice.bytes_total()); - assert_eq!(orig, returning); + let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); } } + res } @@ -647,11 +627,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { page }; - debug_assert_eq!(BoundedBufMut::bytes_total(&buf), PAGE_SZ); + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); self.read_exact_at(buf, offset, ctx) .await - .map(|PageWriteGuardBuf { page }| page) + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -742,14 +722,14 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at( + pub(crate) async fn read_at( &self, - buf: B, + buf: tokio_epoll_uring::Slice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (B, Result) + ) -> 
(tokio_epoll_uring::Slice, Result) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -801,27 +781,16 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( - buf: B, +pub async fn read_exact_at_impl( + mut buf: tokio_epoll_uring::Slice, mut offset: u64, - count: Option, mut read_at: F, -) -> (tokio_epoll_uring::Slice, std::io::Result<()>) +) -> (Buf, std::io::Result<()>) where Buf: IoBufMut + Send, - B: BoundedBufMut, F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future)>, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice<_> = match count { - Some(count) => { - assert!(count <= buf.bytes_total()); - assert!(count > 0); - buf.slice(..count) // may include uninitialized memory - } - None => buf.slice_full(), // includes all the uninitialized memory - }; - while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -903,7 +872,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_basic() { - let buf = Vec::with_capacity(5); + let buf = Vec::with_capacity(5).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![Expectation { offset: 0, @@ -911,42 +880,22 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { - let mock_read_at = Arc::clone(&mock_read_at); - async move { mock_read_at.lock().await.read_at(buf, offset).await } - }) - .await; - assert!(res.is_ok()); - assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c', b'd', b'e']); - } - - #[tokio::test] - async fn test_with_count() { - let buf = Vec::with_capacity(5); - let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { - expectations: 
VecDeque::from(vec![Expectation { - offset: 0, - bytes_total: 3, - result: Ok(vec![b'a', b'b', b'c']), - }]), - })); - - let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) .await; assert!(res.is_ok()); - assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c']); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } #[tokio::test] async fn test_empty_buf_issues_no_syscall() { - let buf = Vec::new(); + let buf = Vec::new().slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -956,7 +905,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_two_read_at_calls_needed_until_buf_filled() { - let buf = Vec::with_capacity(4); + let buf = Vec::with_capacity(4).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -971,18 +920,18 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) .await; assert!(res.is_ok()); - assert_eq!(buf.into_inner(), vec![b'a', b'b', b'c', b'd']); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); } #[tokio::test] async fn test_eof_before_buffer_full() { - let buf = Vec::with_capacity(3); + let buf = Vec::with_capacity(3).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ 
Expectation { @@ -1002,7 +951,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1072,27 +1021,29 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = Vec::with_capacity(PAGE_SZ); - let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(buf)) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { - let res; - (tmp, res) = self.read_at(tmp, self.pos, ctx).await; + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -1206,6 +1157,7 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; + use owned_buffers_io::slice::SliceExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1227,16 +1179,16 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: tokio_epoll_uring::Slice, + mut slice: tokio_epoll_uring::Slice>, offset: u64, ctx: &RequestContext, - ) -> Result, Error> { + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => { - let n = buf.len(); - file.read_exact_at(buf, offset, n, 
ctx).await + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) } - MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } async fn write_all_at, Buf: IoBuf + Send>( @@ -1310,9 +1262,12 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos, ctx).await?; - Ok(String::from_utf8(buf).unwrap()) + let slice = Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } @@ -1531,7 +1486,11 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at_n(buf, 0, SIZE, &ctx).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7a27be2ca108..2820cea097d1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -107,7 +107,7 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{FileGuard, Metadata}; +use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -120,38 +120,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std: } impl IoEngine { - pub(super) async fn read_at( + pub(super) async fn read_at( &self, file_guard: FileGuard, offset: u64, - mut buf: B, - ) -> ((FileGuard, B), std::io::Result) + mut slice: tokio_epoll_uring::Slice, + ) -> ( + 
(FileGuard, tokio_epoll_uring::Slice), + std::io::Result, + ) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { - // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. - let dst = unsafe { - std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) - }; - let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); - if let Ok(nbytes) = &res { - assert!(*nbytes <= buf.bytes_total()); - // SAFETY: see above assertion - unsafe { - buf.set_init(*nbytes); - } - } - #[allow(dropping_references)] - drop(dst); - ((file_guard, buf), res) + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset)); + ((file_guard, slice), res) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, buf).await; + let (resources, res) = system.read(file_guard, offset, slice).await; (resources, res.map_err(epoll_uring_error_to_std)) } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 000000000000..23bf2746f721 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,77 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceExt { + /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. 
+ /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl SliceExt for Slice +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let buf = Vec::with_capacity(3); + + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(Slice::get_mut(&mut slice).len(), 0); + assert_eq!(Slice::get_mut(&mut slice).capacity(), 0); + + let make_fake_file = || bytes::BytesMut::from(&b"123"[..]).reader(); + + // the Slice derefs to &[u8] with bounds [0..bytes_INIT()] + { + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); + assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); + } + + // that is unlike the behavior we get when passing the `Slice` + // into owned buffers IO like with VirtualFile. There, you can + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes. + + // To get the same behavior with a Rust slice, we need to zero-initialize the + // uninitialized parts of the `Slice` first, then create a Rust slice from it. 
+ { + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + } +} From 98d872112ca29d41292a3c91e94e7b65b8eb0b98 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 17:55:14 +0000 Subject: [PATCH 17/24] fix test and pretty up --- .../virtual_file/owned_buffers_io/slice.rs | 70 +++++++++++++++---- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 23bf2746f721..d19e5ddffefb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -41,37 +41,81 @@ mod tests { #[test] fn test_slice_full_zeroed() { - let buf = Vec::with_capacity(3); + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); - let mut slice: Slice<_> = buf.slice_full(); - assert_eq!(Slice::get_mut(&mut slice).len(), 0); - assert_eq!(Slice::get_mut(&mut slice).capacity(), 0); + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } - let make_fake_file = || bytes::BytesMut::from(&b"123"[..]).reader(); + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of 
the initialized part" + ); + } - // the Slice derefs to &[u8] with bounds [0..bytes_INIT()] + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); assert_eq!(slice[..].len(), 0); let mut file = make_fake_file(); - file.read_exact(&mut slice[..]).unwrap(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); } - // that is unlike the behavior we get when passing the `Slice` - // into owned buffers IO like with VirtualFile. There, you can + // With owned buffers IO like with VirtualFilem, you could totally // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 - // and it will read 5 bytes. + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } - // To get the same behavior with a Rust slice, we need to zero-initialize the - // uninitialized parts of the `Slice` first, then create a Rust slice from it. + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait. + // + + // slice_full() { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); let rust_slice = slice.as_mut_rust_slice_full_zeroed(); assert_eq!(rust_slice.len(), 3); - assert_eq!(rust_slice, &[0, 0, 0, 0, 0]); + assert_eq!(rust_slice, &[0, 0, 0]); let mut file = make_fake_file(); file.read_exact(rust_slice).unwrap(); assert_eq!(rust_slice, b"123"); assert_eq!(&slice[..], b"123"); } + + // .slice(..) 
+ { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } } } From 1391b0777ec5c8967d6fe5d473e25a89e6c09ed8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 28 Jun 2024 08:38:01 +0000 Subject: [PATCH 18/24] fixups for config stuff --- libs/remote_storage/src/config.rs | 10 +++++----- pageserver/ctl/src/main.rs | 2 +- pageserver/src/config.rs | 4 ++-- proxy/src/bin/proxy.rs | 9 ++++----- proxy/src/config.rs | 8 ++------ proxy/src/context/parquet.rs | 15 ++++++--------- safekeeper/src/bin/safekeeper.rs | 11 +---------- 7 files changed, 21 insertions(+), 38 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 100fd1c28a31..fa3f2cba58d7 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -175,7 +175,7 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -184,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -194,7 +194,7 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -216,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -244,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c6143..ea09a011e5cf 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ece52cbd960d..c07dc5f0aa2b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -928,7 +928,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item).context("remote_storage")?) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -956,7 +956,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) 
+ builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf55800c..7f4cb2c0100c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa79..af5511d7ec24 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -399,15 +399,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) 
} diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e362..cfc1f8e89e3f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 20650490b1ae..b888bc987118 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use 
tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -548,16 +547,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) 
} #[test] From 73782760d130e8996907182f3d2c1dea49324e85 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 10:23:41 +0000 Subject: [PATCH 19/24] naming: https://github.com/neondatabase/neon/pull/8190#discussion_r1658415703 --- pageserver/src/tenant/ephemeral_file/page_caching.rs | 2 +- pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 313f0736ab2e..0d357f34df79 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -91,7 +91,7 @@ impl RW { assert_eq!(vec.len() / PAGE_SZ, nwritten_blocks); // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk - let buffered = self.rw.inspect_served_from_zero_padded_mutable_tail(); + let buffered = self.rw.get_tail_zero_padded(); vec.extend_from_slice(buffered); assert_eq!(vec.len(), size); assert_eq!(vec.len() % PAGE_SZ, 0); diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index eaf25fb3148a..fe310acab888 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -76,7 +76,7 @@ where } /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
- pub fn inspect_served_from_zero_padded_mutable_tail(&self) -> &[u8] { + pub fn get_tail_zero_padded(&self) -> &[u8] { let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); let buffer_written_up_to = buffer.pending(); // pad to next page boundary From dc34ac34e0c07fe528c72ef455a410d9926aefe8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 10:25:37 +0000 Subject: [PATCH 20/24] naming: https://github.com/neondatabase/neon/pull/8190#discussion_r1658412030 --- pageserver/src/tenant/ephemeral_file.rs | 6 +++--- pageserver/src/tenant/ephemeral_file/page_caching.rs | 2 +- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 03c7ac36d3ab..1a12bd9b2601 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -67,12 +67,12 @@ impl EphemeralFile { self.rw.page_cache_file_id() } - /// See [`self::page_caching::RW::load_into_contiguous_memory`]. - pub(crate) async fn load_into_contiguous_memory( + /// See [`self::page_caching::RW::load_to_vec`]. + pub(crate) async fn load_to_vec( &self, ctx: &RequestContext, ) -> Result, io::Error> { - self.rw.load_into_contiguous_memory(ctx).await + self.rw.load_to_vec(ctx).await } pub(crate) async fn read_blk( diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 0d357f34df79..89eccc5593f7 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -62,7 +62,7 @@ impl RW { /// /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
- pub(super) async fn load_into_contiguous_memory( + pub(super) async fn load_to_vec( &self, ctx: &RequestContext, ) -> Result, io::Error> { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index b1076eb208e2..3c397ef27399 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -680,7 +680,7 @@ impl InMemoryLayer { } } l0_flush::Inner::Direct { .. } => { - let file_contents: Vec = inner.file.load_into_contiguous_memory(ctx).await?; + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; assert_eq!( file_contents.len() % PAGE_SZ, 0, From 5a34d2b6cf1e6b7bd0b850fada6483ffbb9e92ba Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 11:03:37 +0000 Subject: [PATCH 21/24] fmt and linters --- pageserver/src/tenant/ephemeral_file.rs | 5 +---- pageserver/src/tenant/ephemeral_file/page_caching.rs | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 1a12bd9b2601..a2d9d2b022d3 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -68,10 +68,7 @@ impl EphemeralFile { } /// See [`self::page_caching::RW::load_to_vec`]. - pub(crate) async fn load_to_vec( - &self, - ctx: &RequestContext, - ) -> Result, io::Error> { + pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { self.rw.load_to_vec(ctx).await } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 89eccc5593f7..224e29e790bd 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -62,10 +62,7 @@ impl RW { /// /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. 
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. - pub(super) async fn load_to_vec( - &self, - ctx: &RequestContext, - ) -> Result, io::Error> { + pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { // round up to the next PAGE_SZ multiple, required by blob_io let size = { let s = usize::try_from(self.bytes_written()).unwrap(); From 740344eb58e55c7798cbb13ec01b3224997d2b30 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 11:13:10 +0000 Subject: [PATCH 22/24] less awkwardness: https://github.com/neondatabase/neon/pull/8190#discussion_r1658420347 --- .../src/tenant/ephemeral_file/page_caching.rs | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 224e29e790bd..43b9fff28d98 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -72,20 +73,20 @@ impl RW { s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() } }; - let buf = Vec::with_capacity(size); + let vec = Vec::with_capacity(size); // read from disk what we've already flushed let writer = self.rw.as_writer(); - let nwritten_blocks = usize::try_from(writer.nwritten_blocks).unwrap(); - let buf = writer - .load_flushed_blocks_into_contiguous_memory( - buf.slice(0..nwritten_blocks * PAGE_SZ), + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), ctx, ) - .await?; - let mut vec = buf.into_inner(); - assert_eq!(vec.len() % PAGE_SZ, 0); - 
assert_eq!(vec.len() / PAGE_SZ, nwritten_blocks); + .await? + .into_inner(); // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk let buffered = self.rw.get_tail_zero_padded(); @@ -182,21 +183,19 @@ impl PreWarmingWriter { } } - /// Load all the blocks that we already flushed to disk into `buf`. - async fn load_flushed_blocks_into_contiguous_memory( - &self, - buf: tokio_epoll_uring::Slice, - ctx: &RequestContext, - ) -> std::io::Result> - where - Buf: tokio_epoll_uring::IoBufMut + Send, - { - assert_eq!(buf.bytes_total() % PAGE_SZ, 0); - assert_eq!( - buf.bytes_total() / PAGE_SZ, - usize::try_from(self.nwritten_blocks).unwrap() - ); - self.file.read_exact_at(buf, 0, ctx).await + /// Return the byte range within `file` that has been written through `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. + fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) } } From ad004b7987b6a7a8c0243471843a5af7c4664f2a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 13:04:38 +0000 Subject: [PATCH 23/24] use ::default() in all the places --- pageserver/src/config.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c07dc5f0aa2b..435295b61b01 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -488,7 +488,7 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - l0_flush: Set(L0FlushConfig::PageCached), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -1103,7 +1103,7 @@ impl PageServerConf { ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::PageCached, + l0_flush: L0FlushConfig::default(), } } } @@ -1343,7 +1343,7 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::PageCached, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1417,7 +1417,7 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::PageCached, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); From 511d6644c35b3b61e19da6766db7c194d6c481bd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 1 Jul 2024 13:09:13 +0000 Subject: [PATCH 24/24] infer prewarm_on_write from PageServerConf; https://github.com/neondatabase/neon/pull/8190#discussion_r1658420347 --- pageserver/src/l0_flush.rs | 9 ++++++--- pageserver/src/tenant/ephemeral_file.rs | 9 ++------- .../src/tenant/storage_layer/inmemory_layer.rs | 6 ++---- pageserver/src/tenant/timeline.rs | 1 - pageserver/src/tenant/timeline/layer_manager.rs | 14 ++------------ 5 files changed, 12 insertions(+), 27 deletions(-) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index c41f5a292275..7fe8fedc6394 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -33,11 +33,14 @@ impl L0FlushGlobalState { pub(crate) fn inner(&self) -> &Arc { &self.0 } +} +impl L0FlushConfig { pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { - match &*self.0 { - Inner::PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, - 
Inner::Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No, + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No, } } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index a2d9d2b022d3..bb65ae24fc5e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -29,7 +29,6 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - prewarm_on_write: page_caching::PrewarmOnWrite, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -55,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, prewarm_on_write), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -162,11 +161,7 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let prewarm_on_write = - crate::l0_flush::L0FlushGlobalState::new(crate::l0_flush::L0FlushConfig::default()) - .prewarm_on_write(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, prewarm_on_write, &ctx).await?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 3c397ef27399..e1eaea90af57 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -9,7 +9,7 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use 
crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; -use crate::tenant::ephemeral_file::{self, EphemeralFile}; +use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; @@ -475,13 +475,11 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - prewarm_on_write: ephemeral_file::PrewarmPageCacheOnWrite, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, prewarm_on_write, ctx) - .await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cd3d3bb5aff1..e1b72556113a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3671,7 +3671,6 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - self.l0_flush_global_state.prewarm_on_write(), ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4ccd5128b55a..948237e06a5e 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -13,7 +13,6 @@ use crate::{ context::RequestContext, metrics::TimelineMetrics, tenant::{ - ephemeral_file, layer_map::{BatchedUpdates, LayerMap}, storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, @@ -67,7 +66,6 @@ impl LayerManager { /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, /// called within `get_layer_for_write`. 
- #[allow(clippy::too_many_arguments)] pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, @@ -75,7 +73,6 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - prewarm_on_write: ephemeral_file::PrewarmPageCacheOnWrite, ctx: &RequestContext, ) -> Result> { ensure!(lsn.is_aligned()); @@ -112,15 +109,8 @@ impl LayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - prewarm_on_write, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone());