Skip to content

Commit

Permalink
pd: remove all timeouts
Browse files Browse the repository at this point in the history
pd: re-enable req-resp timeout

pd(oblivious): increase timeout to 2s

pd: fix type inference error
  • Loading branch information
erwanor authored and conorsch committed Aug 16, 2023
1 parent ce5b75c commit 022980d
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 21 deletions.
59 changes: 38 additions & 21 deletions crates/bin/pd/src/info/oblivious.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,32 +267,49 @@ impl ObliviousQueryService for Info {
.expect("block fetcher does not fail")
.expect("no error fetching block")
.expect("compact block for in-range height must be present");

let send_op = async { tx_blocks.send(Ok(block.into())).await };
match tokio::time::timeout(tokio::time::Duration::from_secs(1), send_op).await {
Ok(Ok(_)) => {
metrics::increment_counter!(
metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
);
}
Ok(Err(_)) => {
return Err(tonic::Status::internal("error while sending block"));
}
Err(_) => {
tracing::debug!(
"client did not poll compact block stream for 1s, timing out"
);
return Err(tonic::Status::deadline_exceeded(
"timeout while sending block",
));
}
}
// Tracked in #2908: we previously added a timeout on `send` targeting
// buffered streams staying full for too long. However, in at least a few
// "regular usage" instances we observed client streams stopping too eagerly.
// In #2932, it was established that the timeout had to be at least 10s to
// accommodate those use cases.
//
// Although we cannot exclude that clients actually did not poll the stream for
// more than `9s`, this seems unlikely. We are removing the timeout mechanism
// altogether for now. This might negatively impact memory usage under load.
// Future iterations of this work should start by moving block serialization
// outside of the `send_op` future, and investigate if long blocking sends can
// happen for benign reasons (i.e not caused by the client).
//
// let send_op = async { tx_blocks.send(Ok(block.into())).await };
// match tokio::time::timeout(tokio::time::Duration::from_secs(2), send_op).await {
// Ok(Ok(_)) => {
// metrics::increment_counter!(
// metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
// );
// }
// Ok(Err(_)) => {
// return Err(tonic::Status::internal("error while sending block"));
// }
// Err(_) => {
//                     tracing::debug!(
//                         "client did not poll compact block stream for 2s, timing out"
//                     );
// return Err(tonic::Status::deadline_exceeded(
// "timeout while sending block",
// ));
// }
// }

tx_blocks.send(Ok(block.into())).await?;
metrics::increment_counter!(
metrics::CLIENT_OBLIVIOUS_COMPACT_BLOCK_SERVED_TOTAL
);
}

// If the client didn't request a keep-alive, we're done.
if !keep_alive {
// Explicitly annotate the error type, so we can bubble up errors...
return Ok(());
return Ok::<(), anyhow::Error>(());
}

// Before we can stream new compact blocks as they're created,
Expand Down
2 changes: 2 additions & 0 deletions crates/bin/pd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ async fn main() -> anyhow::Result<()> {
})
// Allow HTTP/1, which will be used by grpc-web connections.
.accept_http1(true)
// As part of #2932, we are disabling all timeouts until we circle back to our
// performance story.
// Sets a timeout for all gRPC requests, but note that in the case of streaming
// requests, the timeout is only applied to the initial request. This means that
// this does not prevent long lived streams, for example to allow clients to obtain
Expand Down

0 comments on commit 022980d

Please sign in to comment.