Less granular message queue length metrics

gefjon · gefjon · commit c00797660a24 · 2025-05-21T11:44:25.000-04:00
This commit alters the message queue length metrics introduced by #2754 to be per-database rather than per-client. This should limit the cardinality of these metrics, and it lines up better with the labels of our other metrics. Because the metrics are now per-database rather than per-client, it is no longer correct to simply drop the label when a client disconnects. Instead, care must be taken to decrement each metric by the number of messages that were still waiting in that client's queue at the time of the disconnection. I've added comments to call attention to this complexity.
diff --git a/crates/client-api/src/routes/subscribe.rs b/crates/client-api/src/routes/subscribe.rs
@@ -228,30 +228,23 @@ async fn ws_client_actor_inner(
 
     let addr = client.module.info().database_identity;
 
-    let client_identity = client.sender().id.identity;
-    let connection_id = client.sender().id.connection_id;
-
-    scopeguard::defer!(
-        if let Err(e) = WORKER_METRICS
-            .client_connection_incoming_queue_length
-            .remove_label_values(&addr, &client_identity, &connection_id) {
-                log::error!("Failed to `remove_label_values` for `client_connection_incoming_queue_length`: {e:?}");
-            };
-
-        if let Err(e) = WORKER_METRICS
-            .client_connection_outgoing_queue_length
-            .remove_label_values(&addr, &client_identity, &connection_id) {
-                log::error!("Failed to `remove_label_values` for `client_connection_outgoing_queue_length`: {e:?}");
-            }
-    );
-
-    let incoming_queue_length_metric = WORKER_METRICS
-        .client_connection_incoming_queue_length
-        .with_label_values(&addr, &client_identity, &connection_id);
-
-    let outgoing_queue_length_metric = WORKER_METRICS
-        .client_connection_outgoing_queue_length
-        .with_label_values(&addr, &client_identity, &connection_id);
+    // Grab handles on the total incoming and outgoing queue length metrics,
+    // which we'll increment and decrement as we push into and pull out of those queues.
+    // Note that `total_outgoing_queue_length` is incremented separately,
+    // by `ClientConnectionSender::send` in core/src/client/client_connection.rs;
+    // we're only responsible for decrementing that one.
+    // Also note that much care must be taken to clean up these metrics when the connection closes!
+    // Any path which exits this function must decrement each of these metrics
+    // by the number of messages still waiting in this client's queue,
+    // or else they will grow without bound as clients disconnect, and be useless.
+    let incoming_queue_length_metric = WORKER_METRICS.total_incoming_queue_length.with_label_values(&addr);
+    let outgoing_queue_length_metric = WORKER_METRICS.total_outgoing_queue_length.with_label_values(&addr);
+
+    let clean_up_metrics = |message_queue: &VecDeque<(DataMessage, Instant)>,
+                            sendrx: &mpsc::Receiver<SerializableMessage>| {
+        incoming_queue_length_metric.sub(message_queue.len() as _);
+        outgoing_queue_length_metric.sub(sendrx.len() as _);
+    };
 
     loop {
         rx_buf.clear();
@@ -289,7 +282,10 @@ async fn ws_client_actor_inner(
                     continue;
                 }
                 // the client sent us a close frame
-                None => break,
+                None => {
+                    clean_up_metrics(&message_queue, &sendrx);
+                    break
+                },
             },
 
             // If we have an outgoing message to send, send it off.
@@ -302,31 +298,31 @@ async fn ws_client_actor_inner(
                     //       even though the websocket RFC allows it. should we fork tungstenite?
                     log::info!("dropping messages due to ws already being closed: {:?}", &rx_buf[..n]);
                 } else {
-                    let send_all = async {
-                        for msg in rx_buf.drain(..n) {
-                            let workload = msg.workload();
-                            let num_rows = msg.num_rows();
-
-                            let msg = datamsg_to_wsmsg(serialize(msg, client.config));
-
-                            // These metrics should be updated together,
-                            // or not at all.
-                            if let (Some(workload), Some(num_rows)) = (workload, num_rows) {
-                                WORKER_METRICS
-                                    .websocket_sent_num_rows
-                                    .with_label_values(&addr, &workload)
-                                    .observe(num_rows as f64);
-                                WORKER_METRICS
-                                    .websocket_sent_msg_size
-                                    .with_label_values(&addr, &workload)
-                                    .observe(msg.len() as f64);
-                            }
-                            // feed() buffers the message, but does not necessarily send it
-                            ws.feed(msg).await?;
+                let send_all = async {
+                    for msg in rx_buf.drain(..n) {
+                        let workload = msg.workload();
+                        let num_rows = msg.num_rows();
+
+                        let msg = datamsg_to_wsmsg(serialize(msg, client.config));
+
+                        // These metrics should be updated together,
+                        // or not at all.
+                        if let (Some(workload), Some(num_rows)) = (workload, num_rows) {
+                            WORKER_METRICS
+                                .websocket_sent_num_rows
+                                .with_label_values(&addr, &workload)
+                                .observe(num_rows as f64);
+                            WORKER_METRICS
+                                .websocket_sent_msg_size
+                                .with_label_values(&addr, &workload)
+                                .observe(msg.len() as f64);
                         }
-                        // now we flush all the messages to the socket
-                        ws.flush().await
-                    };
+                        // feed() buffers the message, but does not necessarily send it
+                        ws.feed(msg).await?;
+                    }
+                    // now we flush all the messages to the socket
+                     ws.flush().await
+                 };
                     // Flush the websocket while continuing to poll the `handle_queue`,
                     // to avoid deadlocks or delays due to enqueued futures holding resources.
                     let send_all = also_poll(send_all, make_progress(&mut current_message));
@@ -375,6 +371,7 @@ async fn ws_client_actor_inner(
                 } else {
                     // the client never responded to our ping; drop them without trying to send them a Close
                     log::warn!("client {} timed out", client.id);
+                    clean_up_metrics(&message_queue, &sendrx);
                     break;
                 }
             }
diff --git a/crates/core/src/client/client_connection.rs b/crates/core/src/client/client_connection.rs
@@ -70,6 +70,13 @@ pub struct ClientConnectionSender {
     sendtx: mpsc::Sender<SerializableMessage>,
     abort_handle: AbortHandle,
     cancelled: AtomicBool,
+    /// The `total_outgoing_queue_length` metric labeled with this database's `Identity`,
+    /// which we'll increment whenever sending a message.
+    ///
+    /// This metric will be decremented, and cleaned up,
+    /// by `ws_client_actor_inner` in client-api/src/routes/subscribe.rs.
+    /// Care must be taken not to increment it after the client has disconnected
+    /// and performed its clean-up.
     sendtx_queue_size_metric: Option<IntGauge>,
 }
 
@@ -116,21 +123,26 @@ impl ClientConnectionSender {
             return Err(ClientSendError::Cancelled);
         }
 
-        if let Some(metric) = &self.sendtx_queue_size_metric {
-            metric.inc();
-        }
-
-        self.sendtx.try_send(message).map_err(|e| match e {
-            mpsc::error::TrySendError::Full(_) => {
+        match self.sendtx.try_send(message) {
+            Err(mpsc::error::TrySendError::Full(_)) => {
                 // we've hit CLIENT_CHANNEL_CAPACITY messages backed up in
                 // the channel, so forcibly kick the client
                 tracing::warn!(identity = %self.id.identity, connection_id = %self.id.connection_id, "client channel capacity exceeded");
                 self.abort_handle.abort();
                 self.cancelled.store(true, Relaxed);
-                ClientSendError::Cancelled
+                return Err(ClientSendError::Cancelled);
             }
-            mpsc::error::TrySendError::Closed(_) => ClientSendError::Disconnected,
-        })?;
+            Err(mpsc::error::TrySendError::Closed(_)) => return Err(ClientSendError::Disconnected),
+            Ok(()) => {
+                // If we successfully pushed a message into the queue, increment the queue size metric.
+                // Don't do this before pushing because, if the client has disconnected,
+                // it will already have performed its clean-up,
+                // and so would never perform the corresponding `dec` to this `inc`.
+                if let Some(metric) = &self.sendtx_queue_size_metric {
+                    metric.inc();
+                }
+            }
+        }
 
         Ok(())
     }
@@ -225,9 +237,7 @@ impl ClientConnection {
         })
         .abort_handle();
 
-        let sendtx_queue_size_metric = WORKER_METRICS
-            .client_connection_outgoing_queue_length
-            .with_label_values(&db, &id.identity, &id.connection_id);
+        let sendtx_queue_size_metric = WORKER_METRICS.total_outgoing_queue_length.with_label_values(&db);
 
         let sender = Arc::new(ClientConnectionSender {
             id,
diff --git a/crates/core/src/worker_metrics/mod.rs b/crates/core/src/worker_metrics/mod.rs
@@ -258,15 +258,15 @@ metrics_group!(
         #[labels(txn_type: WorkloadType, db: Identity)]
         pub bytes_sent_to_clients: IntCounterVec,
 
-        #[name = spacetime_client_connection_incoming_queue_length]
-        #[help = "The number of client -> server WebSocket messages waiting in a client connection's incoming queue"]
-        #[labels(db: Identity, client_identity: Identity, connection_id: ConnectionId)]
-        pub client_connection_incoming_queue_length: IntGaugeVec,
-
-        #[name = spacetime_client_connection_outgoing_queue_length]
-        #[help = "The number of server -> client WebSocket messages waiting in a client connection's outgoing queue"]
-        #[labels(db: Identity, client_identity: Identity, connection_id: ConnectionId)]
-        pub client_connection_outgoing_queue_length: IntGaugeVec,
+        #[name = spacetime_total_incoming_queue_length]
+        #[help = "The number of client -> server WebSocket messages waiting any client's incoming queue"]
+        #[labels(db: Identity)]
+        pub total_incoming_queue_length: IntGaugeVec,
+
+        #[name = spacetime_total_outgoing_queue_length]
+        #[help = "The number of server -> client WebSocket messages waiting in any client's outgoing queue"]
+        #[labels(db: Identity)]
+        pub total_outgoing_queue_length: IntGaugeVec,
     }
 );