Skip to content

Commit

Permalink
pd: remove all timeouts
Browse files Browse the repository at this point in the history
pd: re-enable req-resp timeout

pd(oblivious): increase timeout to 2s

pd: fix type inference error
  • Loading branch information
erwanor authored and conorsch committed Aug 16, 2023
1 parent ce5b75c commit 022980d
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 21 deletions.
59 changes: 38 additions & 21 deletions crates/bin/pd/src/info/oblivious.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,32 +267,49 @@ impl ObliviousQueryService for Info {
.expect("block fetcher does not fail")
.expect("no error fetching block")
.expect("compact block for in-range height must be present");

let send_op = async { tx_blocks.send(Ok(block.into())).await };
match tokio::time::timeout(tokio::time::Duration::from_secs(1), send_op).await {
Ok(Ok(_)) => {
metrics::increment_counter!(
metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
);
}
Ok(Err(_)) => {
return Err(tonic::Status::internal("error while sending block"));
}
Err(_) => {
tracing::debug!(
"client did not poll compact block stream for 1s, timing out"
);
return Err(tonic::Status::deadline_exceeded(
"timeout while sending block",
));
}
}
// Tracked in #2908: we previously added a timeout on `send` targeting
// buffered streams staying full for too long. However, in at least a few
// "regular usage" instances we observed client streams stopping too eagerly.
// In #2932, it was established that the timeout had to be at least 10s to
// accommodate those use cases.
//
// Although we cannot exclude that clients actually did not poll the stream for
// more than `9s`, this seems unlikely. We are removing the timeout mechanism
// altogether for now. This might negatively impact memory usage under load.
// Future iterations of this work should start by moving block serialization
// outside of the `send_op` future, and investigate if long blocking sends can
// happen for benign reasons (i.e not caused by the client).
//
// let send_op = async { tx_blocks.send(Ok(block.into())).await };
// match tokio::time::timeout(tokio::time::Duration::from_secs(2), send_op).await {
// Ok(Ok(_)) => {
// metrics::increment_counter!(
// metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
// );
// }
// Ok(Err(_)) => {
// return Err(tonic::Status::internal("error while sending block"));
// }
// Err(_) => {
//                     tracing::debug!(
//                         "client did not poll compact block stream for 2s, timing out"
//                     );
// return Err(tonic::Status::deadline_exceeded(
// "timeout while sending block",
// ));
// }
// }

tx_blocks.send(Ok(block.into())).await?;
metrics::increment_counter!(
metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
);
}

// If the client didn't request a keep-alive, we're done.
if !keep_alive {
// Explicitly annotate the error type, so we can bubble up errors...
return Ok(());
return Ok::<(), anyhow::Error>(());
}

// Before we can stream new compact blocks as they're created,
Expand Down
2 changes: 2 additions & 0 deletions crates/bin/pd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ async fn main() -> anyhow::Result<()> {
})
// Allow HTTP/1, which will be used by grpc-web connections.
.accept_http1(true)
// As part of #2932, we are disabling all timeouts until we circle back to our
// performance story.
// Sets a timeout for all gRPC requests, but note that in the case of streaming
// requests, the timeout is only applied to the initial request. This means that
// this does not prevent long lived streams, for example to allow clients to obtain
Expand Down

0 comments on commit 022980d

Please sign in to comment.