From 8e51a5d9a910975029a415942e69cfbbe7459959 Mon Sep 17 00:00:00 2001 From: Yufeng Zhou Date: Fri, 18 Oct 2024 23:23:57 +0000 Subject: [PATCH] gui: moving window average and max for tile primary metrics --- book/api/websocket.md | 20 +++---- src/disco/gui/fd_gui.c | 108 ++++++++++++++++++++++++++++++---- src/disco/gui/fd_gui.h | 31 ++++++++-- src/disco/gui/fd_gui_printf.c | 52 ++++++++-------- src/disco/gui/fd_gui_printf.h | 6 +- 5 files changed, 161 insertions(+), 56 deletions(-) diff --git a/book/api/websocket.md b/book/api/websocket.md index 64b0b62986..a5524f3399 100644 --- a/book/api/websocket.md +++ b/book/api/websocket.md @@ -586,21 +586,21 @@ potential underflow. | Field | Type | Description |---------------------|---------------------|------------ | next_leader_slot | `number\|null` | The next leader slot | -| tile_primary_metric | `TilePrimaryMetric` | Per-tile-type primary metrics. Some of these are point-in-time values (P), and some are aggregated since the end of the previous leader slot (A) | +| tile_primary_metric | `TilePrimaryMetric` | Per-tile-type primary metrics. 
Some of these are point-in-time values (P), and some are moving window averages (W) | **`TilePrimaryMetric`** | Field | Type | Description | |---------|----------|-------------| -| net_in | `number` | Ingress bytes per second (P) | +| net_in | `number` | Ingress bytes per second (W) | | quic | `number` | Active QUIC connections (P) | -| verify | `number` | Fraction of transactions that failed sigverify (A) | -| dedup | `number` | Fraction of transactions deduplicated (A) | +| verify | `number` | Fraction of transactions that failed sigverify (W) | +| dedup | `number` | Fraction of transactions deduplicated (W) | | pack | `number` | Fraction of pack buffer filled (P) | -| bank | `number` | Execution TPS (P) | -| poh | `number` | Fraction of time spent hashing (P) | -| shred | `number` | Shreds processed per second (P) | -| store | `number` | 50% percentile latency (A) | -| net_out | `number` | Egress bytes per second (P) | +| bank | `number` | Execution TPS (W) | +| poh | `number` | Fraction of time spent hashing (W) | +| shred | `number` | Shreds processed per second (W) | +| store | `number` | 50th percentile latency (W) | +| net_out | `number` | Egress bytes per second (W) | #### `summary.live_tile_timers` @@ -1053,7 +1053,7 @@ are skipped on the currently active fork. |---------------------|---------------------------|-------------| | publish | `SlotPublish` | General information about the slot | | waterfall | `TxnWaterfall\|null` | If the slot is not `mine`, will be `null`. Otherwise, a waterfall showing reasons transactions were acquired since the end of the prior leader slot | -| tile_primary_metric | `TilePrimaryMetric\|null` | If the slot is not `mine`, will be `null`. Otherwise, per-tile-type primary metrics since the end of the prior leader slot | +| tile_primary_metric | `TilePrimaryMetric\|null` | If the slot is not `mine`, will be `null`.
Otherwise, the maximum value of each per-tile-type primary metric observed since the end of the prior leader slot | | tile_timers | `TsTileTimers[]\|null` | If the slot is not `mine`, will be `null`. Otherwise, an array of `TsTileTimers` samples from the slot, sorted earliest to latest | **`TxnWaterfall`** diff --git a/src/disco/gui/fd_gui.c b/src/disco/gui/fd_gui.c index cc4a28bb96..4fb4c65c76 100644 --- a/src/disco/gui/fd_gui.c +++ b/src/disco/gui/fd_gui.c @@ -19,6 +19,8 @@ fd_gui_footprint( void ) { return sizeof(fd_gui_t); } +static void fd_gui_tile_prime_metric_window_reset( fd_gui_t * gui ); + void * fd_gui_new( void * shmem, fd_http_server_t * http, @@ -93,9 +95,14 @@ fd_gui_new( void * shmem, memset( gui->summary.txn_waterfall_reference, 0, sizeof(gui->summary.txn_waterfall_reference) ); memset( gui->summary.txn_waterfall_current, 0, sizeof(gui->summary.txn_waterfall_current) ); - memset( gui->summary.tile_prime_metric_ref, 0, sizeof(gui->summary.tile_prime_metric_ref) ); - memset( gui->summary.tile_prime_metric_cur, 0, sizeof(gui->summary.tile_prime_metric_cur) ); - gui->summary.tile_prime_metric_ref[ 0 ].ts_nanos = fd_log_wallclock(); + /* Sample 0 is initialized below, so start from 1.
*/ + gui->summary.tile_prime_metric_history_idx = 1UL; + for( ulong i=1UL; isummary.tile_prime_metric_history[ i ].ts_nanos = 0L; + } + memset( gui->summary.tile_prime_metric_history, 0, sizeof(gui->summary.tile_prime_metric_history[ 0 ]) ); + gui->summary.tile_prime_metric_history[ 0 ].ts_nanos = fd_log_wallclock(); + fd_gui_tile_prime_metric_window_reset( gui ); memset( gui->summary.tile_timers_snap[ 0 ], 0, sizeof(gui->summary.tile_timers_snap[ 0 ]) ); memset( gui->summary.tile_timers_snap[ 1 ], 0, sizeof(gui->summary.tile_timers_snap[ 1 ]) ); @@ -467,11 +474,60 @@ fd_gui_txn_waterfall_snap( fd_gui_t * gui, } static void -fd_gui_tile_prime_metric_snap( fd_gui_t * gui, - fd_gui_txn_waterfall_t * w_cur, - fd_gui_tile_prime_metric_t * m_cur ) { +fd_gui_tile_prime_metric_window_reset( fd_gui_t * gui ) { + gui->summary.tile_prime_metric_running_window->net_in_bytes_max = 0UL; + gui->summary.tile_prime_metric_running_window->quic_conns_max = 0UL; + gui->summary.tile_prime_metric_running_window->verify_drop_ratio_max = 0.0; + gui->summary.tile_prime_metric_running_window->dedup_drop_ratio_max = 0.0; + gui->summary.tile_prime_metric_running_window->pack_fill_ratio_max = 0.0; + gui->summary.tile_prime_metric_running_window->bank_txn_max = 0UL; + gui->summary.tile_prime_metric_running_window->net_out_bytes_max = 0UL; +} + +static void +fd_gui_tile_prime_metric_recompute_window( fd_gui_t * gui, + fd_gui_tile_prime_metric_t * ref, + fd_gui_tile_prime_metric_t * cur ) { + fd_gui_tile_prime_metric_running_window_t * window = gui->summary.tile_prime_metric_running_window; + + ulong net_in_cur = (cur->net_in_bytes-ref->net_in_bytes)*1000000000UL/(ulong)(cur->ts_nanos-ref->ts_nanos); + ulong net_out_cur = (cur->net_out_bytes-ref->net_out_bytes)*1000000000UL/(ulong)(cur->ts_nanos-ref->ts_nanos); + ulong bank_txn_cur = (cur->bank_txn-ref->bank_txn)*1000000000UL/(ulong)(cur->ts_nanos-ref->ts_nanos); + + double verify_drop_cur = -1.0; + if( FD_LIKELY( 
cur->verify_drop_denominator>ref->verify_drop_denominator ) ) { + verify_drop_cur = (double)(cur->verify_drop_numerator-ref->verify_drop_numerator)/(double)(cur->verify_drop_denominator-ref->verify_drop_denominator); + } + double dedup_drop_cur = -1.0; + if( FD_LIKELY( cur->dedup_drop_denominator>ref->dedup_drop_denominator ) ) { + dedup_drop_cur = (double)(cur->dedup_drop_numerator-ref->dedup_drop_numerator)/(double)(cur->dedup_drop_denominator-ref->dedup_drop_denominator); + } + + double pack_fill_cur = (double)(cur->pack_fill_numerator)/(double)(cur->pack_fill_denominator); + + window->quic_conns_max = fd_ulong_max( window->quic_conns_max, cur->quic_conns ); + window->net_in_bytes_max = fd_ulong_max( window->net_in_bytes_max, net_in_cur ); + window->net_out_bytes_max = fd_ulong_max( window->net_out_bytes_max, net_out_cur ); + window->bank_txn_max = fd_ulong_max( window->bank_txn_max, bank_txn_cur ); + window->verify_drop_ratio_max = verify_drop_cur > window->verify_drop_ratio_max ? verify_drop_cur : window->verify_drop_ratio_max; + window->dedup_drop_ratio_max = dedup_drop_cur > window->dedup_drop_ratio_max ? dedup_drop_cur : window->dedup_drop_ratio_max; + window->pack_fill_ratio_max = pack_fill_cur > window->pack_fill_ratio_max ? 
pack_fill_cur : window->pack_fill_ratio_max; + + window->quic_conns_cur = cur->quic_conns; + window->net_in_bytes_cur = net_in_cur; + window->verify_drop_ratio_cur = verify_drop_cur; + window->dedup_drop_ratio_cur = dedup_drop_cur; + window->pack_fill_ratio_cur = pack_fill_cur; + window->bank_txn_cur = bank_txn_cur; + window->net_out_bytes_cur = net_out_cur; +} + +static void +fd_gui_tile_prime_metric_snap( fd_gui_t * gui, + fd_gui_txn_waterfall_t * w_cur ) { fd_topo_t * topo = gui->topo; + fd_gui_tile_prime_metric_t * m_cur = &gui->summary.tile_prime_metric_history[ gui->summary.tile_prime_metric_history_idx ]; m_cur->ts_nanos = fd_log_wallclock(); m_cur->net_in_bytes = 0UL; @@ -509,6 +565,36 @@ fd_gui_tile_prime_metric_snap( fd_gui_t * gui, m_cur->pack_fill_denominator = pack->pack.max_pending_transactions; m_cur->bank_txn = w_cur->out.block_fail + w_cur->out.block_success; + + gui->summary.tile_prime_metric_history_idx = (gui->summary.tile_prime_metric_history_idx+1UL) % FD_GUI_TILE_METRIC_SAMPLE_CNT; + + fd_gui_tile_prime_metric_t * m_ref = NULL; + for( ulong i=0UL; isummary.tile_prime_metric_history[ (gui->summary.tile_prime_metric_history_idx+i)%FD_GUI_TILE_METRIC_SAMPLE_CNT ]; + if( FD_LIKELY( m_tmp->ts_nanos!=0L && ( m_tmp->ts_nanos+FD_GUI_TILE_METRIC_WINDOW_DURATION_SECONDS*1000L*1000L*1000L>=m_cur->ts_nanos ) ) ) { + /* This is the first sample within the most recent window. */ + m_ref = m_tmp; + break; + } + } + if( FD_UNLIKELY( !m_ref ) ) { + /* We didn't find a suitable reference sample. + This can happen right after boot, when sample 0 has a timestamp + that is too far in the past. + So just use the sample that's one before the last one we + sampled.
*/ + ulong ref_idx = (gui->summary.tile_prime_metric_history_idx+FD_GUI_TILE_METRIC_SAMPLE_CNT-2UL)%FD_GUI_TILE_METRIC_SAMPLE_CNT; + m_ref = &gui->summary.tile_prime_metric_history[ ref_idx ]; + FD_TEST( m_ref->ts_nanos!=0L ); + if( FD_UNLIKELY( ref_idx!=0UL ) ) { + FD_LOG_WARNING(( "Couldn't find a recent enough reference sample for tile primary metrics " + "and it doesn't appear to be due to fresh boot m_ref->ts_nanos=%ld m_cur->ts_nanos=%ld ref_idx=%lu; " + "GUI should be non-blocking and sampling frequent enough", + m_ref->ts_nanos, m_cur->ts_nanos, ref_idx )); + } + } + fd_gui_tile_prime_metric_recompute_window( gui, m_ref, m_cur ); } int @@ -531,8 +617,8 @@ fd_gui_poll( fd_gui_t * gui ) { fd_gui_printf_live_txn_waterfall( gui, gui->summary.txn_waterfall_reference, gui->summary.txn_waterfall_current, 0UL /* TODO: REAL NEXT LEADER SLOT */ ); fd_http_server_ws_broadcast( gui->http ); - fd_gui_tile_prime_metric_snap( gui, gui->summary.txn_waterfall_current, gui->summary.tile_prime_metric_cur ); - fd_gui_printf_live_tile_prime_metric( gui, gui->summary.tile_prime_metric_ref, gui->summary.tile_prime_metric_cur, 0UL ); // TODO: REAL NEXT LEADER SLOT + fd_gui_tile_prime_metric_snap( gui, gui->summary.txn_waterfall_current ); + fd_gui_printf_live_tile_prime_metric( gui, 0UL ); // TODO: REAL NEXT LEADER SLOT fd_http_server_ws_broadcast( gui->http ); gui->next_sample_100millis += 100L*1000L*1000L; @@ -1085,10 +1171,10 @@ fd_gui_handle_slot_end( fd_gui_t * gui, slot. 
*/ fd_gui_txn_waterfall_snap( gui, slot->waterfall_end ); - fd_gui_tile_prime_metric_snap( gui, slot->waterfall_end, slot->tile_prime_metric_end ); + fd_gui_tile_prime_metric_snap( gui, slot->waterfall_end ); memcpy( gui->summary.txn_waterfall_reference, slot->waterfall_end, sizeof(gui->summary.txn_waterfall_reference) ); - memcpy( slot->tile_prime_metric_begin, gui->summary.tile_prime_metric_ref, sizeof(slot->tile_prime_metric_begin) ); - memcpy( gui->summary.tile_prime_metric_ref, slot->tile_prime_metric_end, sizeof(gui->summary.tile_prime_metric_ref) ); + memcpy( slot->tile_prime_metric_window, gui->summary.tile_prime_metric_running_window, sizeof(slot->tile_prime_metric_window) ); + fd_gui_tile_prime_metric_window_reset( gui ); } static void diff --git a/src/disco/gui/fd_gui.h b/src/disco/gui/fd_gui.h index 5581a17e25..aa4d6e42cc 100644 --- a/src/disco/gui/fd_gui.h +++ b/src/disco/gui/fd_gui.h @@ -12,6 +12,10 @@ #define FD_GUI_SLOTS_CNT (864000UL) #define FD_GUI_TPS_HISTORY_WINDOW_DURATION_SECONDS (10L) /* 10 second moving average */ #define FD_GUI_TPS_HISTORY_SAMPLE_CNT (150UL) +/* We sample these every 100ms, so this should be enough for computing + values in the past 1-second window. 
*/ +#define FD_GUI_TILE_METRIC_SAMPLE_CNT (16UL) +#define FD_GUI_TILE_METRIC_WINDOW_DURATION_SECONDS (1L) #define FD_GUI_SLOT_LEVEL_INCOMPLETE (0) #define FD_GUI_SLOT_LEVEL_COMPLETED (1) @@ -141,6 +145,25 @@ struct fd_gui_tile_prime_metric { typedef struct fd_gui_tile_prime_metric fd_gui_tile_prime_metric_t; +struct fd_gui_tile_prime_metric_running_window { + ulong net_in_bytes_cur; + ulong net_in_bytes_max; + ulong quic_conns_cur; + ulong quic_conns_max; + double verify_drop_ratio_cur; + double verify_drop_ratio_max; + double dedup_drop_ratio_cur; + double dedup_drop_ratio_max; + double pack_fill_ratio_cur; + double pack_fill_ratio_max; + ulong bank_txn_cur; + ulong bank_txn_max; + ulong net_out_bytes_cur; + ulong net_out_bytes_max; +}; + +typedef struct fd_gui_tile_prime_metric_running_window fd_gui_tile_prime_metric_running_window_t; + #define FD_GUI_SLOT_LEADER_UNSTARTED (0UL) #define FD_GUI_SLOT_LEADER_STARTED (1UL) #define FD_GUI_SLOT_LEADER_ENDED (2UL) @@ -165,8 +188,7 @@ struct fd_gui_slot { ulong prior_leader_slot; fd_gui_txn_waterfall_t waterfall_end[ 1 ]; - fd_gui_tile_prime_metric_t tile_prime_metric_begin[ 1 ]; - fd_gui_tile_prime_metric_t tile_prime_metric_end[ 1 ]; + fd_gui_tile_prime_metric_running_window_t tile_prime_metric_window[ 1 ]; /* Index into periodic sample array. Inclusive. Points to first sample after slot start sample. 
*/ @@ -252,8 +274,9 @@ struct fd_gui { fd_gui_txn_waterfall_t txn_waterfall_reference[ 1 ]; fd_gui_txn_waterfall_t txn_waterfall_current[ 1 ]; - fd_gui_tile_prime_metric_t tile_prime_metric_ref[ 1 ]; - fd_gui_tile_prime_metric_t tile_prime_metric_cur[ 1 ]; + ulong tile_prime_metric_history_idx; + fd_gui_tile_prime_metric_t tile_prime_metric_history[ FD_GUI_TILE_METRIC_SAMPLE_CNT ]; + fd_gui_tile_prime_metric_running_window_t tile_prime_metric_running_window[ 1 ]; ulong tile_timers_snap_idx; fd_gui_tile_timers_t tile_timers_snap[ 432000UL ][ 64 ]; /* TODO: This can only store about 1 hour of samples */ diff --git a/src/disco/gui/fd_gui_printf.c b/src/disco/gui/fd_gui_printf.c index 0daa65e369..e1d9b97719 100644 --- a/src/disco/gui/fd_gui_printf.c +++ b/src/disco/gui/fd_gui_printf.c @@ -521,27 +521,27 @@ fd_gui_printf_live_txn_waterfall( fd_gui_t * gui, } static void -fd_gui_printf_tile_prime_metric( fd_gui_t * gui, - fd_gui_tile_prime_metric_t * prev, - fd_gui_tile_prime_metric_t * cur ) { +fd_gui_printf_tile_prime_metric( fd_gui_t * gui, + fd_gui_tile_prime_metric_running_window_t * window, + int print_cur ) { jsonp_open_object( gui, "tile_primary_metric" ); - /* Connection count is a point-in-time value not a cumulative value. 
*/ - jsonp_ulong( gui, "quic", cur->quic_conns ); - jsonp_ulong( gui, "net_in", (cur->net_in_bytes-prev->net_in_bytes)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) ); - jsonp_ulong( gui, "net_out", (cur->net_out_bytes - prev->net_out_bytes)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) ); - if( FD_LIKELY( cur->verify_drop_denominator>prev->verify_drop_denominator ) ) { - jsonp_double( gui, "verify", (double)(cur->verify_drop_numerator-prev->verify_drop_numerator)/(double)(cur->verify_drop_denominator-prev->verify_drop_denominator) ); - } else { - jsonp_double( gui, "verify", -1 ); - } - if( FD_LIKELY( cur->dedup_drop_denominator>prev->dedup_drop_denominator ) ) { - jsonp_double( gui, "dedup", (double)(cur->dedup_drop_numerator-prev->dedup_drop_numerator)/(double)(cur->dedup_drop_denominator-prev->dedup_drop_denominator) ); - } else { - jsonp_double( gui, "dedup", -1 ); - } - jsonp_ulong( gui, "bank", (cur->bank_txn-prev->bank_txn)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) ); - /* pack fill rate is a point-in-time value not a cumulative value. 
*/ - jsonp_double( gui, "pack", (double)(cur->pack_fill_numerator)/(double)(cur->pack_fill_denominator) ); + if( FD_LIKELY( print_cur ) ) { + jsonp_ulong( gui, "quic", window->quic_conns_cur ); + jsonp_ulong( gui, "net_in", window->net_in_bytes_cur ); + jsonp_ulong( gui, "net_out", window->net_out_bytes_cur ); + jsonp_double( gui, "verify", window->verify_drop_ratio_cur ); + jsonp_double( gui, "dedup", window->dedup_drop_ratio_cur ); + jsonp_ulong( gui, "bank", window->bank_txn_cur ); + jsonp_double( gui, "pack", window->pack_fill_ratio_cur ); + } else { + jsonp_ulong( gui, "quic", window->quic_conns_max ); + jsonp_ulong( gui, "net_in", window->net_in_bytes_max ); + jsonp_ulong( gui, "net_out", window->net_out_bytes_max ); + jsonp_double( gui, "verify", window->verify_drop_ratio_max ); + jsonp_double( gui, "dedup", window->dedup_drop_ratio_max ); + jsonp_ulong( gui, "bank", window->bank_txn_max ); + jsonp_double( gui, "pack", window->pack_fill_ratio_max ); + } jsonp_double( gui, "poh", 0.0 ); //TODO jsonp_double( gui, "shred", 0.0 );//TODO jsonp_double( gui, "store", 0.0 );//TODO @@ -549,14 +549,12 @@ fd_gui_printf_tile_prime_metric( fd_gui_t * gui, } void -fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui, - fd_gui_tile_prime_metric_t * prev, - fd_gui_tile_prime_metric_t * cur, - ulong next_leader_slot ) { +fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui, + ulong next_leader_slot ) { jsonp_open_envelope( gui, "summary", "live_tile_primary_metric" ); jsonp_open_object( gui, "value" ); jsonp_ulong( gui, "next_leader_slot", next_leader_slot ); - fd_gui_printf_tile_prime_metric( gui, prev, cur ); + fd_gui_printf_tile_prime_metric( gui, gui->summary.tile_prime_metric_running_window, 1 ); jsonp_close_object( gui ); jsonp_close_envelope( gui ); } @@ -1024,7 +1022,7 @@ fd_gui_printf_slot( fd_gui_t * gui, fd_gui_printf_ts_tile_timers( gui, prev_timer, slot->tile_timers_end ); jsonp_close_array( gui );*/ - fd_gui_printf_tile_prime_metric( gui, 
slot->tile_prime_metric_begin, slot->tile_prime_metric_end ); + fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_window, 0 ); } else { jsonp_null( gui, "waterfall" ); // jsonp_null( gui, "tile_timers" ); @@ -1112,7 +1110,7 @@ fd_gui_printf_slot_request( fd_gui_t * gui, fd_gui_printf_ts_tile_timers( gui, prev_timer, slot->tile_timers_end ); jsonp_close_array( gui ); - fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_begin, slot->tile_prime_metric_end ); + fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_window, 0 ); } else { jsonp_null( gui, "waterfall" ); jsonp_null( gui, "tile_timers" ); diff --git a/src/disco/gui/fd_gui_printf.h b/src/disco/gui/fd_gui_printf.h index 6da87e1910..72cc42996a 100644 --- a/src/disco/gui/fd_gui_printf.h +++ b/src/disco/gui/fd_gui_printf.h @@ -89,7 +89,5 @@ fd_gui_printf_live_txn_waterfall( fd_gui_t * gui, ulong next_leader_slot ); void -fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui, - fd_gui_tile_prime_metric_t * prev, - fd_gui_tile_prime_metric_t * cur, - ulong next_leader_slot ); +fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui, + ulong next_leader_slot );