Skip to content

Commit

Permalink
storage_scrubber: fixes to garbage commands (#9409)
Browse files Browse the repository at this point in the history
## Problem

While running `find-garbage` and `purge-garbage`, I encountered two
things that needed updating:
- Console API may omit `user_id` since org accounts were added
- When we cut over to using GenericRemoteStorage, the object listings we
do during purge did not get proper retry handling, so could easily fail
on usual S3 errors, and make the whole process drop out.

...and one bug:
- We had a `.unwrap` which expects that after finding an object in a
tenant path, a listing in that path will always return objects. This is
not true, because a pageserver might be deleting the path at the same
time as we scan it.

## Summary of changes

- When listing objects during purge, use backoff::retry
- Make `user_id` an `Option`
- Handle the case where a tenant's objects go away during find-garbage.
  • Loading branch information
jcsp authored Oct 17, 2024
1 parent 934dbb6 commit db68e82
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 25 deletions.
2 changes: 1 addition & 1 deletion storage_scrubber/src/cloud_admin_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ pub struct ProjectData {
pub name: String,
pub region_id: String,
pub platform_id: String,
pub user_id: String,
pub user_id: Option<String>,
pub pageserver_id: Option<u64>,
#[serde(deserialize_with = "from_nullable_id")]
pub tenant: TenantId,
Expand Down
65 changes: 41 additions & 24 deletions storage_scrubber/src/garbage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat
use serde::{Deserialize, Serialize};
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use utils::{backoff, id::TenantId};

use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote, list_objects_with_retries,
metadata_stream::{stream_tenant_timelines, stream_tenants},
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
};

#[derive(Serialize, Deserialize, Debug)]
Expand Down Expand Up @@ -250,13 +250,16 @@ async fn find_garbage_inner(
&target.tenant_root(&tenant_shard_id),
)
.await?;
let object = tenant_objects.keys.first().unwrap();
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
if let Some(object) = tenant_objects.keys.first() {
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
} else {
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
}
} else {
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran");
}
} else {
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
Expand Down Expand Up @@ -406,14 +409,17 @@ pub async fn get_tenant_objects(
// TODO: apply extra validation based on object modification time. Don't purge
// tenants where any timeline's index_part.json has been touched recently.

let list = s3_client
.list(
Some(&tenant_root),
ListingMode::NoDelimiter,
None,
&CancellationToken::new(),
)
.await?;
let cancel = CancellationToken::new();
let list = backoff::retry(
|| s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel),
|_| false,
3,
MAX_RETRIES as u32,
"get_tenant_objects",
&cancel,
)
.await
.expect("dummy cancellation token")?;
Ok(list.keys)
}

Expand All @@ -424,14 +430,25 @@ pub async fn get_timeline_objects(
tracing::debug!("Listing objects in timeline {ttid}");
let timeline_root = super::remote_timeline_path_id(&ttid);

let list = s3_client
.list(
Some(&timeline_root),
ListingMode::NoDelimiter,
None,
&CancellationToken::new(),
)
.await?;
let cancel = CancellationToken::new();
let list = backoff::retry(
|| {
s3_client.list(
Some(&timeline_root),
ListingMode::NoDelimiter,
None,
&cancel,
)
},
|_| false,
3,
MAX_RETRIES as u32,
"get_timeline_objects",
&cancel,
)
.await
.expect("dummy cancellation token")?;

Ok(list.keys)
}

Expand Down

1 comment on commit db68e82

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5290 tests run: 5073 passed, 0 failed, 217 skipped (full report)


Flaky tests (3)

Postgres 17

Postgres 14

Code coverage* (full report)

  • functions: 31.3% (7551 of 24113 functions)
  • lines: 49.1% (60430 of 122952 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
db68e82 at 2024-10-17T11:05:27.977Z :recycle:

Please sign in to comment.