Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

L0 flush: opt-in mechanism to bypass PageCache reads and writes #8190

Merged
merged 35 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b67c760
WIP
problame Jun 25, 2024
14eb9a1
wip
problame Jun 25, 2024
fc8ece0
Merge branch 'main' into problame/fast-delta-layer-writes
problame Jun 25, 2024
282a633
read_exact_at_impl: accept a BoundedBuf
problame Jun 25, 2024
79a33dc
WIP
problame Jun 25, 2024
c7fc169
WIP
problame Jun 25, 2024
27cae3c
it compiles
problame Jun 25, 2024
1e5c126
Merge branch 'main' into problame/virtualfile-use-boundedbuf
problame Jun 25, 2024
8070fc8
Merge branch 'problame/virtualfile-use-boundedbuf' into problame/fast…
problame Jun 25, 2024
5031f41
hack hack
problame Jun 26, 2024
fdeff87
refactor: generalize toml_edit::Item deserialization
problame Jun 26, 2024
eb593a7
hack hack hack
problame Jun 26, 2024
38608b3
hack hack hack
problame Jun 26, 2024
d4f419a
fix some bugs
problame Jun 26, 2024
9cdbc7b
don't prewarm page cache on EphemeralFile write and add TODO comment …
problame Jun 26, 2024
2f8d12b
self-review
problame Jun 27, 2024
f49b32f
Merge branch 'main' into problame/virtualfile-use-boundedbuf
problame Jun 27, 2024
647f084
get rid of read_exact_at alltogether
problame Jun 27, 2024
928c1dc
re-add read_exact_at
problame Jun 27, 2024
df56595
finish & fix some ub with std-fs (will pull this into a preliminary)
problame Jun 27, 2024
b481740
Merge branch 'main' into problame/virtualfile-use-boundedbuf
problame Jun 27, 2024
562c091
Merge branch 'problame/virtualfile-use-boundedbuf' into problame/fast…
problame Jun 27, 2024
98d8721
fix test and pretty up
problame Jun 27, 2024
2dbcabe
Merge branch 'problame/virtualfile-use-boundedbuf' into problame/fast…
problame Jun 27, 2024
9fa4f47
Merge branch 'main' into problame/virtualfile-use-boundedbuf
problame Jun 27, 2024
d0cbb3d
Merge branch 'problame/virtualfile-use-boundedbuf' into problame/fast…
problame Jun 27, 2024
1391b07
fixups for config stuff
problame Jun 28, 2024
c8a636a
Merge branch 'main' into problame/fast-delta-layer-writes
problame Jun 28, 2024
7378276
naming: https://github.com/neondatabase/neon/pull/8190#discussion_r16…
problame Jul 1, 2024
dc34ac3
naming: https://github.com/neondatabase/neon/pull/8190#discussion_r16…
problame Jul 1, 2024
5a34d2b
fmt and linters
problame Jul 1, 2024
740344e
less awkwardness: https://github.com/neondatabase/neon/pull/8190#disc…
problame Jul 1, 2024
ad004b7
use ::default() in all the places
problame Jul 1, 2024
511d664
infer prewarm_on_write from PageServerConf; https://github.com/neonda…
problame Jul 1, 2024
c5bc214
Merge branch 'main' into problame/fast-delta-layer-writes
problame Jul 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ fn start_pageserver(
background_jobs_can_start: background_jobs_barrier.clone(),
};

info!(config=?conf.l0_flush, "using l0_flush config");
let l0_flush_global_state =
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
Expand All @@ -429,6 +433,7 @@ fn start_pageserver(
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
},
order,
shutdown_pageserver.clone(),
Expand Down
18 changes: 17 additions & 1 deletion pageserver/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ use utils::{
logging::LogFormat,
};

use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

Expand Down Expand Up @@ -291,6 +291,8 @@ pub struct PageServerConf {
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,

pub l0_flush: L0FlushConfig,
}

/// We do not want to store this in a PageServerConf because the latter may be logged
Expand Down Expand Up @@ -396,6 +398,8 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>,

ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

l0_flush: BuilderValue<L0FlushConfig>,
}

impl PageServerConfigBuilder {
Expand Down Expand Up @@ -484,6 +488,7 @@ impl PageServerConfigBuilder {
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
}
}
}
Expand Down Expand Up @@ -671,6 +676,10 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}

pub fn l0_flush(&mut self, value: L0FlushConfig) {
self.l0_flush = BuilderValue::Set(value);
}

pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();

Expand Down Expand Up @@ -728,6 +737,7 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
l0_flush,
}
CUSTOM LOGIC
{
Expand Down Expand Up @@ -1007,6 +1017,9 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"l0_flush" => {
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
Expand Down Expand Up @@ -1090,6 +1103,7 @@ impl PageServerConf {
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
}
}
}
Expand Down Expand Up @@ -1329,6 +1343,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Correct defaults should be used when no config values are provided"
);
Expand Down Expand Up @@ -1402,6 +1417,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Should be able to parse all basic config values correctly"
);
Expand Down
46 changes: 46 additions & 0 deletions pageserver/src/l0_flush.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use std::{num::NonZeroUsize, sync::Arc};

use crate::tenant::ephemeral_file;

#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[default]
PageCached,
#[serde(rename_all = "snake_case")]
Direct { max_concurrency: NonZeroUsize },
}

#[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>);

pub(crate) enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}

impl L0FlushGlobalState {
pub fn new(config: L0FlushConfig) -> Self {
match config {
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
L0FlushConfig::Direct { max_concurrency } => {
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
Self(Arc::new(Inner::Direct { semaphore }))
}
}
}

pub(crate) fn inner(&self) -> &Arc<Inner> {
&self.0
}
}

impl L0FlushConfig {
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
use L0FlushConfig::*;
match self {
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
}
}
}
1 change: 1 addition & 0 deletions pageserver/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;
Expand Down
13 changes: 13 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
Expand Down Expand Up @@ -166,6 +167,7 @@ pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
}

/// A [`Tenant`] is really an _attached_ tenant. The configuration
Expand Down Expand Up @@ -294,6 +296,8 @@ pub struct Tenant {

/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

l0_flush_global_state: L0FlushGlobalState,
}

impl std::fmt::Debug for Tenant {
Expand Down Expand Up @@ -667,6 +671,7 @@ impl Tenant {
broker_client,
remote_storage,
deletion_queue_client,
l0_flush_global_state,
} = resources;

let attach_mode = attached_conf.location.attach_mode;
Expand All @@ -681,6 +686,7 @@ impl Tenant {
tenant_shard_id,
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
));

// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
Expand Down Expand Up @@ -980,6 +986,7 @@ impl Tenant {
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
ctx,
)
Expand Down Expand Up @@ -2469,6 +2476,7 @@ impl Tenant {
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
) -> Tenant {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
Expand Down Expand Up @@ -2556,6 +2564,7 @@ impl Tenant {
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
l0_flush_global_state,
}
}

Expand Down Expand Up @@ -3296,6 +3305,7 @@ impl Tenant {
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
}
}

Expand Down Expand Up @@ -3632,6 +3642,7 @@ pub(crate) mod harness {
use utils::logging;

use crate::deletion_queue::mock::MockDeletionQueue;
use crate::l0_flush::L0FlushConfig;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};

Expand Down Expand Up @@ -3821,6 +3832,8 @@ pub(crate) mod harness {
self.tenant_shard_id,
self.remote_storage.clone(),
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
));

let preload = tenant
Expand Down
22 changes: 22 additions & 0 deletions pageserver/src/tenant/block_io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ where
pub enum BlockLease<'a> {
PageReadGuard(PageReadGuard<'static>),
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
Slice(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
#[cfg(test)]
Expand All @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> {
match self {
BlockLease::PageReadGuard(v) => v.deref(),
BlockLease::EphemeralFileMutableTail(v) => v,
BlockLease::Slice(v) => v,
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
#[cfg(test)]
Expand All @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
Slice(&'a [u8]),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
Expand All @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
Slice(s) => Self::read_blk_slice(s, blknum),
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
Expand All @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
}
}

impl<'a> BlockReaderRef<'a> {
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
let end = start.checked_add(PAGE_SZ).unwrap();
if end > slice.len() {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
format!("slice too short, len={} end={}", slice.len(), end),
));
}
let slice = &slice[start..end];
let page_sized: &[u8; PAGE_SZ] = slice
.try_into()
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
Ok(BlockLease::Slice(page_sized))
}
}

///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///
Expand Down
8 changes: 7 additions & 1 deletion pageserver/src/tenant/ephemeral_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub struct EphemeralFile {
}

mod page_caching;
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
mod zero_padded_read_write;

impl EphemeralFile {
Expand Down Expand Up @@ -53,7 +54,7 @@ impl EphemeralFile {
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file),
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
})
}

Expand All @@ -65,6 +66,11 @@ impl EphemeralFile {
self.rw.page_cache_file_id()
}

/// See [`self::page_caching::RW::load_to_vec`].
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
self.rw.load_to_vec(ctx).await
}

pub(crate) async fn read_blk(
&self,
blknum: u32,
Expand Down
Loading
Loading