Skip to content

Commit

Permalink
gui: moving window average and max for tile primary metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
yufeng-jump committed Oct 19, 2024
1 parent 62a162e commit 8e51a5d
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 56 deletions.
20 changes: 10 additions & 10 deletions book/api/websocket.md
Original file line number Diff line number Diff line change
Expand Up @@ -586,21 +586,21 @@ potential underflow.
| Field | Type | Description
|---------------------|---------------------|------------
| next_leader_slot | `number\|null` | The next leader slot |
| tile_primary_metric | `TilePrimaryMetric` | Per-tile-type primary metrics. Some of these are point-in-time values (P), and some are aggregated since the end of the previous leader slot (A) |
| tile_primary_metric | `TilePrimaryMetric` | Per-tile-type primary metrics. Some of these are point-in-time values (P), and some are moving window averages (W) |

**`TilePrimaryMetric`**
| Field | Type | Description |
|---------|----------|-------------|
| net_in | `number` | Ingress bytes per second (P) |
| net_in | `number` | Ingress bytes per second (W) |
| quic | `number` | Active QUIC connections (P) |
| verify | `number` | Fraction of transactions that failed sigverify (A) |
| dedup | `number` | Fraction of transactions deduplicated (A) |
| verify | `number` | Fraction of transactions that failed sigverify (W) |
| dedup | `number` | Fraction of transactions deduplicated (W) |
| pack | `number` | Fraction of pack buffer filled (P) |
| bank | `number` | Execution TPS (P) |
| poh | `number` | Fraction of time spent hashing (P) |
| shred | `number` | Shreds processed per second (P) |
| store | `number` | 50% percentile latency (A) |
| net_out | `number` | Egress bytes per second (P) |
| bank | `number` | Execution TPS (W) |
| poh | `number` | Fraction of time spent hashing (W) |
| shred | `number` | Shreds processed per second (W) |
| store | `number` | 50th percentile latency (W) |
| net_out | `number` | Egress bytes per second (W) |


#### `summary.live_tile_timers`
Expand Down Expand Up @@ -1053,7 +1053,7 @@ are skipped on the currently active fork.
|---------------------|---------------------------|-------------|
| publish | `SlotPublish` | General information about the slot |
| waterfall | `TxnWaterfall\|null` | If the slot is not `mine`, will be `null`. Otherwise, a waterfall showing reasons transactions were acquired since the end of the prior leader slot |
| tile_primary_metric | `TilePrimaryMetric\|null` | If the slot is not `mine`, will be `null`. Otherwise, per-tile-type primary metrics since the end of the prior leader slot |
| tile_primary_metric | `TilePrimaryMetric\|null` | If the slot is not `mine`, will be `null`. Otherwise, max value of per-tile-type primary metrics since the end of the prior leader slot |
| tile_timers | `TsTileTimers[]\|null` | If the slot is not `mine`, will be `null`. Otherwise, an array of `TsTileTimers` samples from the slot, sorted earliest to latest |

**`TxnWaterfall`**
Expand Down
108 changes: 97 additions & 11 deletions src/disco/gui/fd_gui.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ fd_gui_footprint( void ) {
return sizeof(fd_gui_t);
}

static void fd_gui_tile_prime_metric_window_reset( fd_gui_t * gui );

void *
fd_gui_new( void * shmem,
fd_http_server_t * http,
Expand Down Expand Up @@ -93,9 +95,14 @@ fd_gui_new( void * shmem,
memset( gui->summary.txn_waterfall_reference, 0, sizeof(gui->summary.txn_waterfall_reference) );
memset( gui->summary.txn_waterfall_current, 0, sizeof(gui->summary.txn_waterfall_current) );

memset( gui->summary.tile_prime_metric_ref, 0, sizeof(gui->summary.tile_prime_metric_ref) );
memset( gui->summary.tile_prime_metric_cur, 0, sizeof(gui->summary.tile_prime_metric_cur) );
gui->summary.tile_prime_metric_ref[ 0 ].ts_nanos = fd_log_wallclock();
/* Sample 0 is initialized below, so start from 1. */
gui->summary.tile_prime_metric_history_idx = 1UL;
for( ulong i=1UL; i<FD_GUI_TILE_METRIC_SAMPLE_CNT; i++ ) {
gui->summary.tile_prime_metric_history[ i ].ts_nanos = 0L;
}
memset( gui->summary.tile_prime_metric_history, 0, sizeof(gui->summary.tile_prime_metric_history[ 0 ]) );
gui->summary.tile_prime_metric_history[ 0 ].ts_nanos = fd_log_wallclock();
fd_gui_tile_prime_metric_window_reset( gui );

memset( gui->summary.tile_timers_snap[ 0 ], 0, sizeof(gui->summary.tile_timers_snap[ 0 ]) );
memset( gui->summary.tile_timers_snap[ 1 ], 0, sizeof(gui->summary.tile_timers_snap[ 1 ]) );
Expand Down Expand Up @@ -467,11 +474,60 @@ fd_gui_txn_waterfall_snap( fd_gui_t * gui,
}

static void
fd_gui_tile_prime_metric_snap( fd_gui_t * gui,
fd_gui_txn_waterfall_t * w_cur,
fd_gui_tile_prime_metric_t * m_cur ) {
fd_gui_tile_prime_metric_window_reset( fd_gui_t * gui ) {
  /* Clears every running maximum held in the GUI summary window so the
     next round of samples starts accumulating from zero.  Only the
     *_max fields are written here; the *_cur fields are left as they
     are. */

  /* Integer-valued maxima. */
  gui->summary.tile_prime_metric_running_window[ 0 ].quic_conns_max    = 0UL;
  gui->summary.tile_prime_metric_running_window[ 0 ].net_in_bytes_max  = 0UL;
  gui->summary.tile_prime_metric_running_window[ 0 ].net_out_bytes_max = 0UL;
  gui->summary.tile_prime_metric_running_window[ 0 ].bank_txn_max      = 0UL;

  /* Ratio-valued maxima. */
  gui->summary.tile_prime_metric_running_window[ 0 ].verify_drop_ratio_max = 0.0;
  gui->summary.tile_prime_metric_running_window[ 0 ].dedup_drop_ratio_max  = 0.0;
  gui->summary.tile_prime_metric_running_window[ 0 ].pack_fill_ratio_max   = 0.0;
}

/* fd_gui_tile_prime_metric_recompute_window refreshes the running
   window stored in gui->summary from two samples: ref (the older
   reference sample) and cur (the newest sample).

   Byte and transaction counters are converted to per-second rates over
   the ref->cur interval.  The verify and dedup drop ratios are computed
   from the counter deltas and are reported as -1.0 when the
   denominator did not advance between the two samples.  The QUIC
   connection count and the pack fill ratio are taken from cur alone.

   Every *_cur field of the window is overwritten with the freshly
   computed value, and every *_max field is raised to that value when
   it is larger.

   If cur's timestamp is not strictly after ref's, the rates are
   reported as 0 instead of dividing by a zero (or wrapped negative)
   interval. */

static void
fd_gui_tile_prime_metric_recompute_window( fd_gui_t *                   gui,
                                           fd_gui_tile_prime_metric_t * ref,
                                           fd_gui_tile_prime_metric_t * cur ) {
  fd_gui_tile_prime_metric_running_window_t * window = gui->summary.tile_prime_metric_running_window;

  /* The timestamps are wallclock readings, so guard against two samples
     landing on the same nanosecond and against the clock stepping
     backwards.  Either would make the integer divisions below divide by
     zero or by a huge wrapped value. */
  long elapsed_nanos = cur->ts_nanos-ref->ts_nanos;

  ulong net_in_cur   = 0UL;
  ulong net_out_cur  = 0UL;
  ulong bank_txn_cur = 0UL;
  if( FD_LIKELY( elapsed_nanos>0L ) ) {
    ulong elapsed = (ulong)elapsed_nanos;
    net_in_cur   = (cur->net_in_bytes-ref->net_in_bytes)*1000000000UL/elapsed;
    net_out_cur  = (cur->net_out_bytes-ref->net_out_bytes)*1000000000UL/elapsed;
    bank_txn_cur = (cur->bank_txn-ref->bank_txn)*1000000000UL/elapsed;
  }

  /* -1.0 marks "no new data in this interval" for the drop ratios. */
  double verify_drop_cur = -1.0;
  if( FD_LIKELY( cur->verify_drop_denominator>ref->verify_drop_denominator ) ) {
    verify_drop_cur = (double)(cur->verify_drop_numerator-ref->verify_drop_numerator)/(double)(cur->verify_drop_denominator-ref->verify_drop_denominator);
  }
  double dedup_drop_cur = -1.0;
  if( FD_LIKELY( cur->dedup_drop_denominator>ref->dedup_drop_denominator ) ) {
    dedup_drop_cur = (double)(cur->dedup_drop_numerator-ref->dedup_drop_numerator)/(double)(cur->dedup_drop_denominator-ref->dedup_drop_denominator);
  }

  /* Pack fill is computed from the newest sample only, not from a delta.
     NOTE(review): assumes pack_fill_denominator is non-zero; a zero
     denominator would produce inf/NaN here -- confirm against the pack
     tile configuration. */
  double pack_fill_cur = (double)(cur->pack_fill_numerator)/(double)(cur->pack_fill_denominator);

  /* Raise the running maxima. */
  window->quic_conns_max        = fd_ulong_max( window->quic_conns_max,    cur->quic_conns );
  window->net_in_bytes_max      = fd_ulong_max( window->net_in_bytes_max,  net_in_cur      );
  window->net_out_bytes_max     = fd_ulong_max( window->net_out_bytes_max, net_out_cur     );
  window->bank_txn_max          = fd_ulong_max( window->bank_txn_max,      bank_txn_cur    );
  window->verify_drop_ratio_max = verify_drop_cur > window->verify_drop_ratio_max ? verify_drop_cur : window->verify_drop_ratio_max;
  window->dedup_drop_ratio_max  = dedup_drop_cur  > window->dedup_drop_ratio_max  ? dedup_drop_cur  : window->dedup_drop_ratio_max;
  window->pack_fill_ratio_max   = pack_fill_cur   > window->pack_fill_ratio_max   ? pack_fill_cur   : window->pack_fill_ratio_max;

  /* Publish the latest values. */
  window->quic_conns_cur        = cur->quic_conns;
  window->net_in_bytes_cur      = net_in_cur;
  window->verify_drop_ratio_cur = verify_drop_cur;
  window->dedup_drop_ratio_cur  = dedup_drop_cur;
  window->pack_fill_ratio_cur   = pack_fill_cur;
  window->bank_txn_cur          = bank_txn_cur;
  window->net_out_bytes_cur     = net_out_cur;
}

static void
fd_gui_tile_prime_metric_snap( fd_gui_t * gui,
fd_gui_txn_waterfall_t * w_cur ) {
fd_topo_t * topo = gui->topo;

fd_gui_tile_prime_metric_t * m_cur = &gui->summary.tile_prime_metric_history[ gui->summary.tile_prime_metric_history_idx ];
m_cur->ts_nanos = fd_log_wallclock();

m_cur->net_in_bytes = 0UL;
Expand Down Expand Up @@ -509,6 +565,36 @@ fd_gui_tile_prime_metric_snap( fd_gui_t * gui,
m_cur->pack_fill_denominator = pack->pack.max_pending_transactions;

m_cur->bank_txn = w_cur->out.block_fail + w_cur->out.block_success;

gui->summary.tile_prime_metric_history_idx = (gui->summary.tile_prime_metric_history_idx+1UL) % FD_GUI_TILE_METRIC_SAMPLE_CNT;

fd_gui_tile_prime_metric_t * m_ref = NULL;
for( ulong i=0UL; i<FD_GUI_TILE_METRIC_SAMPLE_CNT-1; i++ ) {
/* Find reference sample for computing window values. */
fd_gui_tile_prime_metric_t * m_tmp = &gui->summary.tile_prime_metric_history[ (gui->summary.tile_prime_metric_history_idx+i)%FD_GUI_TILE_METRIC_SAMPLE_CNT ];
if( FD_LIKELY( m_tmp->ts_nanos!=0L && ( m_tmp->ts_nanos+FD_GUI_TILE_METRIC_WINDOW_DURATION_SECONDS*1000L*1000L*1000L>=m_cur->ts_nanos ) ) ) {
/* This is the first sample within the most recent window. */
m_ref = m_tmp;
break;
}
}
if( FD_UNLIKELY( !m_ref ) ) {
/* No sample inside the window was found to use as the reference.
This can happen right after boot, when sample 0 carries a timestamp
that is too old.
In that case fall back to the sample taken immediately before the
one we just recorded. */
ulong ref_idx = (gui->summary.tile_prime_metric_history_idx+FD_GUI_TILE_METRIC_SAMPLE_CNT-2UL)%FD_GUI_TILE_METRIC_SAMPLE_CNT;
m_ref = &gui->summary.tile_prime_metric_history[ ref_idx ];
FD_TEST( m_ref->ts_nanos!=0L );
if( FD_UNLIKELY( ref_idx!=0UL ) ) {
FD_LOG_WARNING(( "Couldn't find a recent enough reference sample for tile primary metrics "
"and it doesn't appear to be due to fresh boot m_ref->ts_nanos=%ld m_cur->ts_nanos=%ld ref_idx=%lu; "
"GUI should be non-blocking and sampling frequent enough",
m_ref->ts_nanos, m_cur->ts_nanos, ref_idx ));
}
}
fd_gui_tile_prime_metric_recompute_window( gui, m_ref, m_cur );
}

int
Expand All @@ -531,8 +617,8 @@ fd_gui_poll( fd_gui_t * gui ) {
fd_gui_printf_live_txn_waterfall( gui, gui->summary.txn_waterfall_reference, gui->summary.txn_waterfall_current, 0UL /* TODO: REAL NEXT LEADER SLOT */ );
fd_http_server_ws_broadcast( gui->http );

fd_gui_tile_prime_metric_snap( gui, gui->summary.txn_waterfall_current, gui->summary.tile_prime_metric_cur );
fd_gui_printf_live_tile_prime_metric( gui, gui->summary.tile_prime_metric_ref, gui->summary.tile_prime_metric_cur, 0UL ); // TODO: REAL NEXT LEADER SLOT
fd_gui_tile_prime_metric_snap( gui, gui->summary.txn_waterfall_current );
fd_gui_printf_live_tile_prime_metric( gui, 0UL ); // TODO: REAL NEXT LEADER SLOT
fd_http_server_ws_broadcast( gui->http );

gui->next_sample_100millis += 100L*1000L*1000L;
Expand Down Expand Up @@ -1085,10 +1171,10 @@ fd_gui_handle_slot_end( fd_gui_t * gui,
slot. */

fd_gui_txn_waterfall_snap( gui, slot->waterfall_end );
fd_gui_tile_prime_metric_snap( gui, slot->waterfall_end, slot->tile_prime_metric_end );
fd_gui_tile_prime_metric_snap( gui, slot->waterfall_end );
memcpy( gui->summary.txn_waterfall_reference, slot->waterfall_end, sizeof(gui->summary.txn_waterfall_reference) );
memcpy( slot->tile_prime_metric_begin, gui->summary.tile_prime_metric_ref, sizeof(slot->tile_prime_metric_begin) );
memcpy( gui->summary.tile_prime_metric_ref, slot->tile_prime_metric_end, sizeof(gui->summary.tile_prime_metric_ref) );
memcpy( slot->tile_prime_metric_window, gui->summary.tile_prime_metric_running_window, sizeof(slot->tile_prime_metric_window) );
fd_gui_tile_prime_metric_window_reset( gui );
}

static void
Expand Down
31 changes: 27 additions & 4 deletions src/disco/gui/fd_gui.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
#define FD_GUI_SLOTS_CNT (864000UL)
#define FD_GUI_TPS_HISTORY_WINDOW_DURATION_SECONDS (10L) /* 10 second moving average */
#define FD_GUI_TPS_HISTORY_SAMPLE_CNT (150UL)
/* We sample these every 100ms, so this should be enough for computing
values in the past 1-second window. */
#define FD_GUI_TILE_METRIC_SAMPLE_CNT (16UL)
#define FD_GUI_TILE_METRIC_WINDOW_DURATION_SECONDS (1L)

#define FD_GUI_SLOT_LEVEL_INCOMPLETE (0)
#define FD_GUI_SLOT_LEVEL_COMPLETED (1)
Expand Down Expand Up @@ -141,6 +145,25 @@ struct fd_gui_tile_prime_metric {

typedef struct fd_gui_tile_prime_metric fd_gui_tile_prime_metric_t;

/* fd_gui_tile_prime_metric_running_window holds the derived tile
   primary metrics.  Each metric is stored as a pair: the *_cur field
   is the value computed from the most recent sample (relative to its
   reference sample where the metric is a delta), and the *_max field
   is the largest such value seen since the window was last reset.
   The window reset sets only the *_max fields, to zero. */
struct fd_gui_tile_prime_metric_running_window {
/* Ingress rate: byte counter delta scaled to bytes per second. */
ulong net_in_bytes_cur;
ulong net_in_bytes_max;
/* QUIC connection count copied from the newest sample. */
ulong quic_conns_cur;
ulong quic_conns_max;
/* Verify drop ratio: numerator delta over denominator delta.  The
   _cur value is -1.0 when the denominator did not advance. */
double verify_drop_ratio_cur;
double verify_drop_ratio_max;
/* Dedup drop ratio, computed the same way as the verify ratio
   (including the -1.0 value when the denominator did not advance). */
double dedup_drop_ratio_cur;
double dedup_drop_ratio_max;
/* Pack fill ratio: numerator over denominator of the newest sample. */
double pack_fill_ratio_cur;
double pack_fill_ratio_max;
/* Bank rate: bank_txn counter delta scaled to a per-second value. */
ulong bank_txn_cur;
ulong bank_txn_max;
/* Egress rate: byte counter delta scaled to bytes per second. */
ulong net_out_bytes_cur;
ulong net_out_bytes_max;
};

typedef struct fd_gui_tile_prime_metric_running_window fd_gui_tile_prime_metric_running_window_t;

#define FD_GUI_SLOT_LEADER_UNSTARTED (0UL)
#define FD_GUI_SLOT_LEADER_STARTED (1UL)
#define FD_GUI_SLOT_LEADER_ENDED (2UL)
Expand All @@ -165,8 +188,7 @@ struct fd_gui_slot {
ulong prior_leader_slot;
fd_gui_txn_waterfall_t waterfall_end[ 1 ];

fd_gui_tile_prime_metric_t tile_prime_metric_begin[ 1 ];
fd_gui_tile_prime_metric_t tile_prime_metric_end[ 1 ];
fd_gui_tile_prime_metric_running_window_t tile_prime_metric_window[ 1 ];

/* Index into periodic sample array. Inclusive.
Points to first sample after slot start sample. */
Expand Down Expand Up @@ -252,8 +274,9 @@ struct fd_gui {
fd_gui_txn_waterfall_t txn_waterfall_reference[ 1 ];
fd_gui_txn_waterfall_t txn_waterfall_current[ 1 ];

fd_gui_tile_prime_metric_t tile_prime_metric_ref[ 1 ];
fd_gui_tile_prime_metric_t tile_prime_metric_cur[ 1 ];
ulong tile_prime_metric_history_idx;
fd_gui_tile_prime_metric_t tile_prime_metric_history[ FD_GUI_TILE_METRIC_SAMPLE_CNT ];
fd_gui_tile_prime_metric_running_window_t tile_prime_metric_running_window[ 1 ];

ulong tile_timers_snap_idx;
fd_gui_tile_timers_t tile_timers_snap[ 432000UL ][ 64 ]; /* TODO: This can only store about 1 hour of samples */
Expand Down
52 changes: 25 additions & 27 deletions src/disco/gui/fd_gui_printf.c
Original file line number Diff line number Diff line change
Expand Up @@ -521,42 +521,40 @@ fd_gui_printf_live_txn_waterfall( fd_gui_t * gui,
}

static void
fd_gui_printf_tile_prime_metric( fd_gui_t * gui,
fd_gui_tile_prime_metric_t * prev,
fd_gui_tile_prime_metric_t * cur ) {
fd_gui_printf_tile_prime_metric( fd_gui_t * gui,
fd_gui_tile_prime_metric_running_window_t * window,
int print_cur ) {
jsonp_open_object( gui, "tile_primary_metric" );
/* Connection count is a point-in-time value not a cumulative value. */
jsonp_ulong( gui, "quic", cur->quic_conns );
jsonp_ulong( gui, "net_in", (cur->net_in_bytes-prev->net_in_bytes)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) );
jsonp_ulong( gui, "net_out", (cur->net_out_bytes - prev->net_out_bytes)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) );
if( FD_LIKELY( cur->verify_drop_denominator>prev->verify_drop_denominator ) ) {
jsonp_double( gui, "verify", (double)(cur->verify_drop_numerator-prev->verify_drop_numerator)/(double)(cur->verify_drop_denominator-prev->verify_drop_denominator) );
} else {
jsonp_double( gui, "verify", -1 );
}
if( FD_LIKELY( cur->dedup_drop_denominator>prev->dedup_drop_denominator ) ) {
jsonp_double( gui, "dedup", (double)(cur->dedup_drop_numerator-prev->dedup_drop_numerator)/(double)(cur->dedup_drop_denominator-prev->dedup_drop_denominator) );
} else {
jsonp_double( gui, "dedup", -1 );
}
jsonp_ulong( gui, "bank", (cur->bank_txn-prev->bank_txn)*1000000000UL/(ulong)(cur->ts_nanos-prev->ts_nanos) );
/* pack fill rate is a point-in-time value not a cumulative value. */
jsonp_double( gui, "pack", (double)(cur->pack_fill_numerator)/(double)(cur->pack_fill_denominator) );
if( FD_LIKELY( print_cur ) ) {
jsonp_ulong( gui, "quic", window->quic_conns_cur );
jsonp_ulong( gui, "net_in", window->net_in_bytes_cur );
jsonp_ulong( gui, "net_out", window->net_out_bytes_cur );
jsonp_double( gui, "verify", window->verify_drop_ratio_cur );
jsonp_double( gui, "dedup", window->dedup_drop_ratio_cur );
jsonp_ulong( gui, "bank", window->bank_txn_cur );
jsonp_double( gui, "pack", window->pack_fill_ratio_cur );
} else {
jsonp_ulong( gui, "quic", window->quic_conns_max );
jsonp_ulong( gui, "net_in", window->net_in_bytes_max );
jsonp_ulong( gui, "net_out", window->net_out_bytes_max );
jsonp_double( gui, "verify", window->verify_drop_ratio_max );
jsonp_double( gui, "dedup", window->dedup_drop_ratio_max );
jsonp_ulong( gui, "bank", window->bank_txn_max );
jsonp_double( gui, "pack", window->pack_fill_ratio_max );
}
jsonp_double( gui, "poh", 0.0 ); //TODO
jsonp_double( gui, "shred", 0.0 );//TODO
jsonp_double( gui, "store", 0.0 );//TODO
jsonp_close_object( gui );
}

void
fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui,
fd_gui_tile_prime_metric_t * prev,
fd_gui_tile_prime_metric_t * cur,
ulong next_leader_slot ) {
fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui,
ulong next_leader_slot ) {
jsonp_open_envelope( gui, "summary", "live_tile_primary_metric" );
jsonp_open_object( gui, "value" );
jsonp_ulong( gui, "next_leader_slot", next_leader_slot );
fd_gui_printf_tile_prime_metric( gui, prev, cur );
fd_gui_printf_tile_prime_metric( gui, gui->summary.tile_prime_metric_running_window, 1 );
jsonp_close_object( gui );
jsonp_close_envelope( gui );
}
Expand Down Expand Up @@ -1024,7 +1022,7 @@ fd_gui_printf_slot( fd_gui_t * gui,
fd_gui_printf_ts_tile_timers( gui, prev_timer, slot->tile_timers_end );
jsonp_close_array( gui );*/

fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_begin, slot->tile_prime_metric_end );
fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_window, 0 );
} else {
jsonp_null( gui, "waterfall" );
// jsonp_null( gui, "tile_timers" );
Expand Down Expand Up @@ -1112,7 +1110,7 @@ fd_gui_printf_slot_request( fd_gui_t * gui,
fd_gui_printf_ts_tile_timers( gui, prev_timer, slot->tile_timers_end );
jsonp_close_array( gui );

fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_begin, slot->tile_prime_metric_end );
fd_gui_printf_tile_prime_metric( gui, slot->tile_prime_metric_window, 0 );
} else {
jsonp_null( gui, "waterfall" );
jsonp_null( gui, "tile_timers" );
Expand Down
6 changes: 2 additions & 4 deletions src/disco/gui/fd_gui_printf.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,5 @@ fd_gui_printf_live_txn_waterfall( fd_gui_t * gui,
ulong next_leader_slot );

void
fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui,
fd_gui_tile_prime_metric_t * prev,
fd_gui_tile_prime_metric_t * cur,
ulong next_leader_slot );
fd_gui_printf_live_tile_prime_metric( fd_gui_t * gui,
ulong next_leader_slot );

0 comments on commit 8e51a5d

Please sign in to comment.