Skip to content

Commit

Permalink
fix(pageserver): make repartition error critical (#10872)
Browse files Browse the repository at this point in the history
## Problem

Read errors during repartition should be a critical error.

## Summary of changes

<del>We only have one call site</del> We have two call sites of
`repartition` where one of them is during the initial image upload
optimization and another is during image layer creation, so I added a
`critical!` here instead of inside `collect_keyspace`.

---------

Signed-off-by: Alex Chi Z <[email protected]>
  • Loading branch information
skyzh authored Feb 18, 2025
1 parent 9d074db commit a4e3989
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 4 deletions.
1 change: 1 addition & 0 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler(
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
Expand Down
6 changes: 6 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3150,6 +3150,12 @@ impl Tenant {
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::CollectKeySpaceError(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()
Expand Down
3 changes: 3 additions & 0 deletions pageserver/src/tenant/tasks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,13 +287,16 @@ fn log_compaction_error(
sleep_duration: Duration,
task_cancelled: bool,
) {
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;

let level = match err {
ShuttingDown => return,
Offload(_) => Level::ERROR,
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
CollectKeySpaceError(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {
let root_cause = err.root_cause();
Expand Down
9 changes: 7 additions & 2 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1881,7 +1881,7 @@ impl Timeline {
// Signal compaction failure to avoid L0 flush stalls when it's broken.
match result {
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
Err(CompactionError::Other(_)) => {
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
// Don't change the current value on offload failure or shutdown. We don't want to
Expand Down Expand Up @@ -4604,7 +4604,10 @@ impl Timeline {
));
}

let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let (dense_ks, sparse_ks) = self
.collect_keyspace(lsn, ctx)
.await
.map_err(CompactionError::CollectKeySpaceError)?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
Expand Down Expand Up @@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError {
#[error("Failed to offload timeline: {0}")]
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error("Failed to collect keyspace: {0}")]
CollectKeySpaceError(CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
}
Expand Down
16 changes: 14 additions & 2 deletions pageserver/src/tenant/timeline/compaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
Timeline,
};

use anyhow::{anyhow, bail, Context};
Expand All @@ -31,6 +32,7 @@ use utils::id::TimelineId;

use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
Expand Down Expand Up @@ -781,7 +783,17 @@ impl Timeline {
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() && !err.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
if let CompactionError::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)),
) = err
{
critical!("could not compact, repartitioning keyspace failed: {err:?}");
} else {
tracing::error!(
"could not compact, repartitioning keyspace failed: {err:?}"
);
}
}
}
};
Expand Down

1 comment on commit a4e3989

@github-actions
Copy link

@github-actions github-actions bot commented on a4e3989 Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1372 tests run: 1335 passed, 1 failed, 36 skipped (full report)


Failures on Postgres 17

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_neon_cli_basics[release-pg17]"
Flaky tests (2)

Postgres 17

Test coverage report is not available

The comment gets automatically updated with the latest test results
a4e3989 at 2025-02-19T02:30:02.944Z :recycle:

Please sign in to comment.