Skip to content

Commit

Permalink
Activate timelines during unoffload (#9399)
Browse files Browse the repository at this point in the history
The current code has forgotten to activate timelines during unoffload,
leading to inability to receive the basebackup, due to the timeline
still being in loading state.

```
  stderr:
    command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014

    Caused by:
        0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading
        1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading
```

Therefore, also activate the timeline during unoffloading.

Part of #8088
  • Loading branch information
arpad-m authored Oct 16, 2024
1 parent 9668601 commit 55b2460
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 14 deletions.
7 changes: 6 additions & 1 deletion pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

tenant
.apply_timeline_archival_config(timeline_id, request_data.state, ctx)
.apply_timeline_archival_config(
timeline_id,
request_data.state,
state.broker_client.clone(),
ctx,
)
.await?;
Ok::<_, ApiError>(())
}
Expand Down
40 changes: 27 additions & 13 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1554,6 +1554,7 @@ impl Tenant {
async fn unoffload_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
broker_client: storage_broker::BrokerClientChannel,
ctx: RequestContext,
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
Expand Down Expand Up @@ -1605,25 +1606,37 @@ impl Tenant {
})
.map_err(TimelineArchivalError::Other)?;
let timelines = self.timelines.lock().unwrap();
if let Some(timeline) = timelines.get(&timeline_id) {
let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
if offloaded_timelines.remove(&timeline_id).is_none() {
warn!("timeline already removed from offloaded timelines");
}
info!("timeline unoffloading complete");
Ok(Arc::clone(timeline))
} else {
let Some(timeline) = timelines.get(&timeline_id) else {
warn!("timeline not available directly after attach");
Err(TimelineArchivalError::Other(anyhow::anyhow!(
return Err(TimelineArchivalError::Other(anyhow::anyhow!(
"timeline not available directly after attach"
)))
)));
};
let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
if offloaded_timelines.remove(&timeline_id).is_none() {
warn!("timeline already removed from offloaded timelines");
}

// Activate the timeline (if it makes sense)
if !(timeline.is_broken() || timeline.is_stopping()) {
let background_jobs_can_start = None;
timeline.activate(
self.clone(),
broker_client.clone(),
background_jobs_can_start,
&ctx,
);
}

info!("timeline unoffloading complete");
Ok(Arc::clone(timeline))
}

pub(crate) async fn apply_timeline_archival_config(
self: &Arc<Self>,
timeline_id: TimelineId,
new_state: TimelineArchivalState,
broker_client: storage_broker::BrokerClientChannel,
ctx: RequestContext,
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
Expand Down Expand Up @@ -1664,12 +1677,13 @@ impl Tenant {
Some(Arc::clone(timeline))
};

// Second part: unarchive timeline (if needed)
// Second part: unoffload timeline (if needed)
let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded {
timeline
} else {
// Turn offloaded timeline into a non-offloaded one
self.unoffload_timeline(timeline_id, ctx).await?
self.unoffload_timeline(timeline_id, broker_client, ctx)
.await?
};

// Third part: upload new timeline archival state and block until it is present in S3
Expand Down Expand Up @@ -3354,7 +3368,7 @@ impl Tenant {
/// Populate all Timelines' `GcInfo` with information about their children. We do not set the
/// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
///
/// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
/// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`].
fn initialize_gc_info(
&self,
timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
Expand Down
17 changes: 17 additions & 0 deletions test_runner/regress/test_timeline_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
)

with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
"INSERT INTO foo SELECT FROM generate_series(1,1000)",
]
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")

ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
Expand Down Expand Up @@ -197,4 +208,10 @@ def leaf_offloaded():
)
assert leaf_detail["is_archived"] is False

with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
assert sum == sum_again

assert not timeline_offloaded(initial_timeline_id)

1 comment on commit 55b2460

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5202 tests run: 4994 passed, 1 failed, 207 skipped (full report)


Failures on Postgres 16

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharding_split_failures[release-pg16-failure15]"
Flaky tests (1)

Postgres 16

Test coverage report is not available

The comment gets automatically updated with the latest test results
55b2460 at 2024-10-16T16:20:02.397Z :recycle:

Please sign in to comment.