# pageserver: tighten up code around SLRU dir key handling (#10082)
## Problem

Changes in #9786 were functionally complete but missed some edge cases, which
made testing less robust than it should have been:
- `is_key_disposable` didn't consider SLRU dir keys disposable
- Timeline `init_empty` was always creating SLRU dir keys on all shards

The result was that when we had a bug
(#10080), it wasn't apparent in
tests: one would only encounter the issue when running on a
long-lived timeline with enough compaction to drop the initially created
empty SLRU dir keys, _and_ with some CLog truncation going on (the
shard-0-only invariant involved is sketched below).

Closes: neondatabase/cloud#21516
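
To make the invariant concrete, here is a small self-contained Rust sketch of the rule both bullets rely on: SLRU dir keys live on shard 0 only, so a shard > 0 should neither create them nor keep them across compaction. The types and the ownership rule below are toy stand-ins, not the pageserver's actual `ShardIdentity`/`is_key_disposable` implementation.

```rust
// Toy model: a key is disposable on a shard if it is neither global nor owned
// by that shard. SLRU dir keys are shard-0-only, so shards > 0 may drop them.

#[derive(Clone, Copy)]
enum KeyKind {
    RelBlock { hash: u32 }, // spread across shards (stand-in for the real stripe mapping)
    SlruBlock,              // shard 0 only
    SlruSegmentSize,        // shard 0 only
    SlruDir,                // shard 0 only: the case this PR tightens up
    AuxFile,                // shard 0 only
}

struct Shard {
    number: u32,
    count: u32,
}

impl Shard {
    // Global keys are stored on every shard; none of the kinds modeled here are global.
    fn is_key_global(&self, key: KeyKind) -> bool {
        !matches!(
            key,
            KeyKind::RelBlock { .. }
                | KeyKind::SlruBlock
                | KeyKind::SlruSegmentSize
                | KeyKind::SlruDir
                | KeyKind::AuxFile
        )
    }

    // Toy ownership rule: rel blocks hash to one shard, everything else lives on shard 0.
    fn owns(&self, key: KeyKind) -> bool {
        match key {
            KeyKind::RelBlock { hash } => hash % self.count == self.number,
            _ => self.number == 0,
        }
    }

    // A shard may drop a key at compaction if it is neither global nor owned here.
    // Per the problem description, SLRU dir keys were not treated as disposable on
    // shards > 0, so stale copies lingered there.
    fn is_key_disposable(&self, key: KeyKind) -> bool {
        !self.is_key_global(key) && !self.owns(key)
    }
}

fn main() {
    let shard0 = Shard { number: 0, count: 2 };
    let shard1 = Shard { number: 1, count: 2 };
    assert!(!shard0.is_key_disposable(KeyKind::SlruDir)); // shard 0 keeps the dir key
    assert!(shard1.is_key_disposable(KeyKind::SlruDir)); // other shards may drop it
}
```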

## Summary of changes

- Update `is_key_global` and `init_empty` to handle SLRU dir keys properly. The
only functional impact is that we avoid writing some spurious keys on shards >0
(as sketched below), but this makes testing much more robust.
- Make `test_clog_truncate` explicitly use a sharded tenant

The net result is that if one reverts #10080, then tests fail (i.e. this
PR is a reproducer for the issue).
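
In the same hedged spirit, here is a standalone sketch of the `init_empty` side (toy types, not the real `DatadirModification`): the empty SLRU directory images are staged only when the timeline belongs to shard 0, so shards > 0 never write keys they would later have to discard.

```rust
use std::collections::BTreeMap;

// Toy stand-ins for the three SLRU directory keys written by timeline init.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum SlruKind {
    Clog,
    MultiXactMembers,
    MultiXactOffsets,
}

struct TimelineInit {
    shard_number: u32,
    // key -> serialized (empty) directory image staged for the initial batch
    pending: BTreeMap<SlruKind, Vec<u8>>,
}

impl TimelineInit {
    fn is_shard_zero(&self) -> bool {
        self.shard_number == 0
    }

    fn init_empty(&mut self) {
        // Stand-in for serializing SlruSegmentDirectory::default()
        let empty_dir: Vec<u8> = Vec::new();

        // Mirror of the gate this commit adds: only shard 0 stores the three
        // SLRU directory keys; other shards skip them entirely.
        if self.is_shard_zero() {
            for kind in [
                SlruKind::Clog,
                SlruKind::MultiXactMembers,
                SlruKind::MultiXactOffsets,
            ] {
                self.pending.insert(kind, empty_dir.clone());
            }
        }
    }
}

fn main() {
    let mut shard0 = TimelineInit { shard_number: 0, pending: BTreeMap::new() };
    let mut shard1 = TimelineInit { shard_number: 1, pending: BTreeMap::new() };
    shard0.init_empty();
    shard1.init_empty();
    assert_eq!(shard0.pending.len(), 3); // shard 0 gets Clog + MultiXact members/offsets
    assert!(shard1.pending.is_empty()); // nothing spurious on shards > 0
}
```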
jcsp authored Dec 16, 2024
1 parent 117c1b5 commit ebcbc1a
Showing 4 changed files with 42 additions and 21 deletions.
libs/pageserver_api/src/key.rs (4 additions, 0 deletions)

```diff
@@ -565,6 +565,10 @@ impl Key {
             && self.field5 == 0
             && self.field6 == u32::MAX
     }
+
+    pub fn is_slru_dir_key(&self) -> bool {
+        slru_dir_kind(self).is_some()
+    }
 }
 
 #[inline(always)]
```
libs/pageserver_api/src/shard.rs (5 additions, 1 deletion)

```diff
@@ -173,7 +173,11 @@ impl ShardIdentity {
 
     /// Return true if the key should be stored on all shards, not just one.
    pub fn is_key_global(&self, key: &Key) -> bool {
-        if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
+        if key.is_slru_block_key()
+            || key.is_slru_segment_size_key()
+            || key.is_aux_file_key()
+            || key.is_slru_dir_key()
+        {
             // Special keys that are only stored on shard 0
             false
         } else if key.is_rel_block_key() {
```
pageserver/src/pgdatadir_mapping.rs (17 additions, 12 deletions)

```diff
@@ -1319,18 +1319,23 @@ impl<'a> DatadirModification<'a> {
 
         let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
         let empty_dir = Value::Image(buf);
-        self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
-        self.put(
-            slru_dir_to_key(SlruKind::MultiXactMembers),
-            empty_dir.clone(),
-        );
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
-        self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+
+        // Initialize SLRUs on shard 0 only: creating these on other shards would be
+        // harmless but they'd just be dropped on later compaction.
+        if self.tline.tenant_shard_id.is_shard_zero() {
+            self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.put(
+                slru_dir_to_key(SlruKind::MultiXactMembers),
+                empty_dir.clone(),
+            );
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
+            self.pending_directory_entries
+                .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+        }
 
         Ok(())
     }
```
test_runner/regress/test_clog_truncate.py (16 additions, 8 deletions)

```diff
@@ -1,18 +1,19 @@
 from __future__ import annotations
 
 import os
-import time
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import query_scalar
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import query_scalar, wait_until
 
 
 #
 # Test compute node start after clog truncation
 #
-def test_clog_truncate(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_clog_truncate(neon_env_builder: NeonEnvBuilder):
+    # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and
+    # this test is one of the very few that exercises a CLogTruncate WAL record.
+    env = neon_env_builder.init_start(initial_tenant_shard_count=2)
 
     # set aggressive autovacuum to make sure that truncation will happen
     config = [
@@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
 
     # Consume many xids to advance clog
+    log.info("Consuming xids...")
     with endpoint.cursor() as cur:
         cur.execute("select test_consume_xids(1000*1000*10);")
         log.info("xids consumed")
@@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000")
     log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
 
-    while os.path.isfile(pg_xact_0000_path):
-        log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}")
-        time.sleep(5)
+    def assert_file_removed():
+        exists = os.path.isfile(pg_xact_0000_path)
+        if exists:
+            log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}")
+        assert not exists
+
+    log.info("Waiting for truncation...")
+    wait_until(assert_file_removed)
 
     # checkpoint to advance latest lsn
+    log.info("Checkpointing...")
     with endpoint.cursor() as cur:
         cur.execute("CHECKPOINT;")
         lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()")
```

1 comment on commit ebcbc1a

@github-actions


7084 tests run: 6785 passed, 2 failed, 297 skipped (full report)


Failures on Postgres 17

```bash
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_timeline_archival_chaos[release-pg17] or test_timeline_archival_chaos[release-pg17]"
```
Flaky tests (2)

- Postgres 17
- Postgres 15
  - test_physical_replication_config_mismatch_too_many_known_xids: release-arm64

Test coverage report is not available

The comment gets automatically updated with the latest test results
ebcbc1a at 2024-12-16T11:02:52.542Z :recycle:
