Skip to content

Commit

Permalink
Offloaded timeline deletion (#9519)
Browse files Browse the repository at this point in the history
As pointed out in
#9489 (comment) ,
we currently didn't support deletion for offloaded timelines after the
timeline has been loaded from the manifest instead of having been
offloaded.

This was because the upload queue hasn't been initialized yet. This PR
thus initializes the timeline and shuts it down immediately.

Part of #8088
  • Loading branch information
arpad-m authored Oct 29, 2024
1 parent 07b9744 commit a73402e
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 46 deletions.
15 changes: 3 additions & 12 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -626,19 +626,10 @@ impl TimelineOrOffloaded {
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
}
}
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
match self {
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
Some(remote_client) => remote_client,
None => {
let remote_client = tenant.build_timeline_client(
offloaded.timeline_id,
tenant.remote_storage.clone(),
);
Arc::new(remote_client)
}
},
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(),
}
}
}
Expand Down
31 changes: 28 additions & 3 deletions pageserver/src/tenant/timeline/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, Instrument};
use tracing::{error, info, info_span, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

use crate::{
Expand All @@ -15,7 +15,7 @@ use crate::{
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
},
Expand Down Expand Up @@ -258,7 +258,32 @@ impl DeleteTimelineFlow {
))?
});

let remote_client = timeline.remote_client_maybe_construct(tenant);
let remote_client = match timeline.maybe_remote_client() {
Some(remote_client) => remote_client,
None => {
let remote_client = tenant
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
let result = remote_client
.download_index_file(&tenant.cancel)
.instrument(info_span!("download_index_file"))
.await
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
let index_part = match result {
MaybeDeletedIndexPart::Deleted(p) => {
tracing::info!("Timeline already set as deleted in remote index");
p
}
MaybeDeletedIndexPart::IndexPart(p) => p,
};
let remote_client = Arc::new(remote_client);

remote_client
.init_upload_queue(&index_part)
.map_err(DeleteTimelineError::Other)?;
remote_client.shutdown().await;
remote_client
}
};
set_deleted_in_remote_index(&remote_client).await?;

fail::fail_point!("timeline-delete-before-schedule", |_| {
Expand Down
88 changes: 57 additions & 31 deletions test_runner/regress/test_timeline_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
}
)

# Create two branches and archive them
parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id)
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
# Create three branches that depend on each other, starting with two
grandparent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_grandparent", tenant_id
)
parent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent"
)

# write some stuff to the parent
with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
Expand All @@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")

# create the third branch
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
)

ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
Expand All @@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
state=TimelineArchivalState.ARCHIVED,
)

ps_http.timeline_archival_config(
tenant_id,
grandparent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)

def timeline_offloaded_logged(timeline_id: TimelineId) -> bool:
return (
env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
Expand Down Expand Up @@ -201,30 +215,34 @@ def leaf_offloaded():

ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
grandparent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
parent_detail = ps_http.timeline_detail(
tenant_id,
leaf_timeline_id,
parent_timeline_id,
)
assert leaf_detail["is_archived"] is False
assert parent_detail["is_archived"] is False

with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
assert sum == sum_again

# Test that deletion of offloaded timelines works
ps_http.timeline_delete(tenant_id, leaf_timeline_id)

assert not timeline_offloaded_logged(initial_timeline_id)


def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("delete_timeline", [False, True])
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool):
"""
Test for persistence of timeline offload state
"""
Expand Down Expand Up @@ -306,27 +324,35 @@ def child_offloaded():
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)

ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
child_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert child_detail["is_archived"] is False
if delete_timeline:
ps_http.timeline_delete(tenant_id, child_timeline_id)
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
else:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
child_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert child_detail["is_archived"] is False

with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again

assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)

assert not timeline_offloaded_api(root_timeline_id)

Expand Down

1 comment on commit a73402e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5371 tests run: 5139 passed, 0 failed, 232 skipped (full report)


Flaky tests (1)

Postgres 17

Code coverage* (full report)

  • functions: 31.3% (7685 of 24573 functions)
  • lines: 48.7% (60396 of 123975 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
a73402e at 2024-10-29T12:17:53.779Z :recycle:

Please sign in to comment.