Skip to content

Commit

Permalink
feat(pageserver): allow read path debug in getpagelsn API (#10748)
Browse files Browse the repository at this point in the history
## Problem

The usual workflow for me to debug read path errors in staging is:
download the tenant to my laptop, import, and then run some read tests.

With this patch, we can do this directly over staging pageservers.

## Summary of changes

* Add a new `touchpagelsn` API that does a page read but does not return
page info back.
* Allow read from latest record LSN from get/touchpagelsn
* Add read_debug config in the context.
* The read path will read the context config to decide whether to enable
read path tracing or not.

Signed-off-by: Alex Chi Z <[email protected]>
  • Loading branch information
skyzh authored Feb 18, 2025
1 parent cb80605 commit 538ea03
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 12 deletions.
12 changes: 12 additions & 0 deletions pageserver/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ pub struct RequestContext {
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
read_path_debug: bool,
}

/// The kind of access to the page cache.
Expand Down Expand Up @@ -155,6 +156,7 @@ impl RequestContextBuilder {
download_behavior: DownloadBehavior::Download,
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
read_path_debug: false,
},
}
}
Expand All @@ -168,6 +170,7 @@ impl RequestContextBuilder {
download_behavior: original.download_behavior,
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
read_path_debug: original.read_path_debug,
},
}
}
Expand All @@ -191,6 +194,11 @@ impl RequestContextBuilder {
self
}

pub(crate) fn read_path_debug(mut self, b: bool) -> Self {
self.inner.read_path_debug = b;
self
}

pub fn build(self) -> RequestContext {
self.inner
}
Expand Down Expand Up @@ -291,4 +299,8 @@ impl RequestContext {
pub(crate) fn page_content_kind(&self) -> PageContentKind {
self.page_content_kind
}

pub(crate) fn read_path_debug(&self) -> bool {
self.read_path_debug
}
}
50 changes: 39 additions & 11 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;

use crate::config::PageServerConf;
use crate::context::RequestContextBuilder;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::pgdatadir_mapping::LsnForTimestamp;
Expand Down Expand Up @@ -2571,14 +2572,30 @@ async fn deletion_queue_flush(
}
}

/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
async fn getpage_at_lsn_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
getpage_at_lsn_handler_inner(false, request, cancel).await
}

async fn touchpage_at_lsn_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
getpage_at_lsn_handler_inner(true, request, cancel).await
}

/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
async fn getpage_at_lsn_handler_inner(
touch: bool,
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// Require pageserver admin permission for this API instead of only tenant-level token.
check_permission(&request, None)?;
let state = get_state(&request);

struct Key(pageserver_api::key::Key);
Expand All @@ -2593,22 +2610,29 @@ async fn getpage_at_lsn_handler(

let key: Key = parse_query_param(&request, "key")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;

async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
// Enable read path debugging
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;

// Use last_record_lsn if no lsn is provided
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let page = timeline.get(key.0, lsn, &ctx).await?;

Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
if touch {
json_response(StatusCode::OK, ())
} else {
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
}
}
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
Expand Down Expand Up @@ -3743,6 +3767,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
|r| api_handler(r, touchpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|r| api_handler(r, timeline_collect_keyspace),
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ impl Timeline {
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let read_path = if self.conf.enable_read_path_debugging {
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
Some(ReadPath::new(keyspace.clone(), lsn))
} else {
None
Expand Down

1 comment on commit 538ea03

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7697 tests run: 7314 passed, 0 failed, 383 skipped (full report)


Flaky tests (4)

Postgres 17

Postgres 14

Code coverage* (full report)

  • functions: 32.9% (8624 of 26191 functions)
  • lines: 48.9% (72734 of 148843 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
538ea03 at 2025-02-18T22:03:49.796Z :recycle:

Please sign in to comment.