diff --git a/ffi/rust/firedancer-sys/build.rs b/ffi/rust/firedancer-sys/build.rs index a9295b239c..bfdde23542 100644 --- a/ffi/rust/firedancer-sys/build.rs +++ b/ffi/rust/firedancer-sys/build.rs @@ -62,7 +62,7 @@ fn main() { .clang_arg(format!("-I{prefix}/")) .header(&format!("wrapper_{lib}.h")) .blocklist_type("schar|uchar|ushort|uint|ulong") - .blocklist_item("SORT_QUICK_ORDER_STYLE|SORT_MERGE_THRESH|SORT_QUICK_THRESH|SORT_QUICK_ORDER_STYLE|SORT_QUICK_SWAP_MINIMIZE"); + .blocklist_item("SORT_QUICK_ORDER_STYLE|SORT_MERGE_THRESH|SORT_QUICK_THRESH|SORT_QUICK_ORDER_STYLE|SORT_QUICK_SWAP_MINIMIZE|MAP_MEMOIZE|MAP_QUERY_OPT|MAP_KEY_EQUAL_IS_SLOW"); // Well this is a complete mess. We want to only include, say, functions // declared in the `ballet` directory in the ballet bindgen output. If diff --git a/ffi/rust/firedancer-sys/src/tango/mod.rs b/ffi/rust/firedancer-sys/src/tango/mod.rs index 77d451c0a7..3c515a4f0f 100644 --- a/ffi/rust/firedancer-sys/src/tango/mod.rs +++ b/ffi/rust/firedancer-sys/src/tango/mod.rs @@ -5,6 +5,7 @@ mod fseq; mod mcache; mod tcache; mod xdp; +mod stake; pub use cnc::*; pub use dcache::*; @@ -13,3 +14,4 @@ pub use fseq::*; pub use mcache::*; pub use tcache::*; pub use xdp::*; +pub use stake::*; diff --git a/ffi/rust/firedancer-sys/src/tango/stake.rs b/ffi/rust/firedancer-sys/src/tango/stake.rs new file mode 100644 index 0000000000..7876444e07 --- /dev/null +++ b/ffi/rust/firedancer-sys/src/tango/stake.rs @@ -0,0 +1,13 @@ +pub use crate::generated::{ + fd_stake_t, + fd_stake_align, + fd_stake_footprint, + fd_stake_join, + fd_stake_new, + fd_stake_version, + fd_stake_version_laddr, + fd_stake_write, + fd_stake_read, + fd_stake_dump, + FD_STAKE_ALIGN +}; diff --git a/src/app/fdctl/config.h b/src/app/fdctl/config.h index 5d0bae76fd..ede35c6a74 100644 --- a/src/app/fdctl/config.h +++ b/src/app/fdctl/config.h @@ -127,12 +127,13 @@ typedef struct { ushort listen_port; char xdp_mode[ 8 ]; - uint max_concurrent_connections; + ulong max_concurrent_connections; uint max_concurrent_connection_ids_per_connection; uint max_concurrent_streams_per_connection; uint max_concurrent_handshakes; uint max_inflight_quic_packets; uint tx_buf_size; + int stake_lg_slot_cnt; uint xdp_rx_queue_size; uint xdp_tx_queue_size; uint xdp_aio_depth; diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index a891b36c27..6f6091c8b0 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -377,6 +377,8 @@ dynamic_port_range = "8000-10000" # Maximum number of simultaneous QUIC connections which can be open. New # connections which would exceed this limit will not be accepted. + # + # Must be >=2 and a power of 2. max_concurrent_connections = 32 # While in TCP a connection is identified by (Source IP, Source Port, @@ -393,7 +395,9 @@ dynamic_port_range = "8000-10000" # # Currently this option does nothing, as we do not support creating # additional connection IDs. - max_concurrent_connection_ids_per_connection = 16 + # + # Should be in sync with `fd_quic_conn.h`. + max_concurrent_connection_ids_per_connection = 4 # QUIC allows for multiple streams to be multiplexed over a single # connection. This option sets the maximum number of simultaneous @@ -438,6 +442,9 @@ dynamic_port_range = "8000-10000" # should be unused. tx_buf_size = 4096 + # Lg number of stake nodes to track + stake_lg_slot_cnt = 10 + # Firedancer uses XDP for fast packet processing. XDP supports two # modes, XDP_SKB and XDP_DRV. 
XDP_DRV is preferred as it is faster, but # is not supported by all drivers. diff --git a/src/app/fdctl/configure/workspace.c b/src/app/fdctl/configure/workspace.c index 7d3b87d017..ea4937855d 100644 --- a/src/app/fdctl/configure/workspace.c +++ b/src/app/fdctl/configure/workspace.c @@ -2,12 +2,21 @@ #include "../../../tango/fd_tango.h" #include "../../../tango/quic/fd_quic.h" +#include "../../../tango/quic/fd_quic_qos.h" #include "../../../tango/xdp/fd_xsk_aio.h" +#include "../../../tango/udpsock/fd_udpsock.h" #include #include +#include +#include +#include +#include +#include +#include #define NAME "workspace" +#define FD_HAS_XDP 0 static void init_perm( security_t * security, @@ -81,6 +90,20 @@ static void quic( void * pod, char * fmt, fd_quic_limits_t * limits, ... ) { fd_quic_new ( shmem, limits ) ); } +static void quic_qos( void * pod, char * fmt, fd_quic_qos_limits_t * limits, ... ) { + INSERTER( limits, + fd_quic_qos_align ( ), + fd_quic_qos_footprint( limits ), + fd_quic_qos_new ( shmem, limits ) ); +} + +static void stake( void * pod, char * fmt, int lg_slot_cnt, ... ) { + INSERTER( lg_slot_cnt, + fd_stake_align ( ), + fd_stake_footprint( lg_slot_cnt ), + fd_stake_new ( shmem, lg_slot_cnt ) ); +} + static void xsk( void * pod, char * fmt, ulong frame_sz, ulong rx_depth, ulong tx_depth, ... ) { INSERTER( tx_depth, fd_xsk_align ( ), @@ -95,6 +118,13 @@ static void xsk_aio( void * pod, char * fmt, ulong tx_depth, ulong batch_count, fd_xsk_aio_new ( shmem, tx_depth, batch_count ) ); } +static void udpsock( void * pod, char * fmt, ulong frame_sz, ulong rx_depth, ulong tx_depth, ... ) { + INSERTER( tx_depth, + fd_udpsock_align ( ), + fd_udpsock_footprint( frame_sz, rx_depth, tx_depth ), + fd_udpsock_new ( shmem, frame_sz, rx_depth, tx_depth ) ); +} + static void alloc( void * pod, char * fmt, ulong align, ulong sz, ... 
) { INSERTER( sz, align, sz, 1 ); } @@ -196,19 +226,6 @@ init( config_t * const config ) { if( FD_LIKELY( uid == 0 && seteuid( config->uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, strerror( errno ) )); - fd_quic_limits_t limits = { - .conn_cnt = config->tiles.quic.max_concurrent_connections, - .handshake_cnt = config->tiles.quic.max_concurrent_handshakes, - .conn_id_cnt = config->tiles.quic.max_concurrent_connection_ids_per_connection, - .conn_id_sparsity = 0.0, - .inflight_pkt_cnt = config->tiles.quic.max_inflight_quic_packets, - .tx_buf_sz = config->tiles.quic.tx_buf_size, - .stream_cnt[ FD_QUIC_STREAM_TYPE_BIDI_CLIENT ] = 0, - .stream_cnt[ FD_QUIC_STREAM_TYPE_BIDI_SERVER ] = 0, - .stream_cnt[ FD_QUIC_STREAM_TYPE_UNI_CLIENT ] = config->tiles.quic.max_concurrent_streams_per_connection, - .stream_cnt[ FD_QUIC_STREAM_TYPE_UNI_SERVER ] = 0, - }; - for( ulong j=0; jshmem.workspaces_cnt; j++ ) { workspace_config_t * wksp1 = &config->shmem.workspaces[ j ]; WKSP_BEGIN( config, wksp1, 0 ); @@ -242,17 +259,64 @@ init( config_t * const config ) { fseq ( pod, "fseq%lu", i ); } break; - case wksp_quic: - cnc ( pod, "cnc" ); - quic ( pod, "quic", &limits ); - xsk ( pod, "xsk", 2048, config->tiles.quic.xdp_rx_queue_size, config->tiles.quic.xdp_tx_queue_size ); - xsk_aio( pod, "xsk_aio", config->tiles.quic.xdp_tx_queue_size, config->tiles.quic.xdp_aio_depth ); - + case wksp_quic:; + fd_quic_limits_t quic_limits = { + .conn_cnt = config->tiles.quic.max_concurrent_connections, + .handshake_cnt = config->tiles.quic.max_concurrent_handshakes, + .conn_id_cnt = config->tiles.quic.max_concurrent_connection_ids_per_connection, + .conn_id_sparsity = 0.0, + .inflight_pkt_cnt = config->tiles.quic.max_inflight_quic_packets, + .tx_buf_sz = config->tiles.quic.tx_buf_size, + .stream_cnt[ FD_QUIC_STREAM_TYPE_BIDI_CLIENT ] = 0, + .stream_cnt[ FD_QUIC_STREAM_TYPE_BIDI_SERVER ] = 0, + .stream_cnt[ FD_QUIC_STREAM_TYPE_UNI_CLIENT ] = config->tiles.quic.max_concurrent_streams_per_connection, + .stream_cnt[ FD_QUIC_STREAM_TYPE_UNI_SERVER ] = 0, + }; + + int lg_max_conns = fd_ulong_find_msb( config->tiles.quic.max_concurrent_connections ); + if ( lg_max_conns < 1 ) FD_LOG_ERR( ( "max_concurrent_connections must be at least 2." 
) ); + fd_quic_qos_limits_t quic_qos_limits = { + .min_streams = FD_QUIC_QOS_DEFAULT_MIN_STREAMS, + .max_streams = FD_QUIC_QOS_DEFAULT_MAX_STREAMS, + .total_streams = FD_QUIC_QOS_DEFAULT_TOTAL_STREAMS, + .pq_lg_slot_cnt = lg_max_conns - 1, + .lru_depth = config->tiles.quic.max_concurrent_connections >> 1, + }; + + cnc ( pod, "cnc" ); + quic ( pod, "quic", &quic_limits ); + quic_qos( pod, "quic_qos", &quic_qos_limits ); + stake ( pod, "stake", 10 ); // FIXME slot cnt is not getting parsed + + #if FD_HAS_XDP + (void)udpsock; + xsk ( pod, "xsk", 2048, config->tiles.quic.xdp_rx_queue_size, config->tiles.quic.xdp_tx_queue_size ); + xsk_aio ( pod, "xsk_aio", config->tiles.quic.xdp_tx_queue_size, config->tiles.quic.xdp_aio_depth ); char const * quic_xsk_gaddr = fd_pod_query_cstr( pod, "xsk", NULL ); void * shmem = fd_wksp_map ( quic_xsk_gaddr ); if( FD_UNLIKELY( !fd_xsk_bind( shmem, config->name, config->tiles.quic.interface, (uint)wksp1->kind_idx ) ) ) FD_LOG_ERR(( "failed to bind xsk for quic tile %lu", wksp1->kind_idx )); fd_wksp_unmap( shmem ); + #else + (void)xsk; + (void)xsk_aio; + int sock_fd = socket( AF_INET, SOCK_DGRAM, IPPROTO_UDP ); + if( FD_UNLIKELY( sock_fd<0 ) ) { + FD_LOG_ERR( ( + "socket(AF_INET,SOCK_DGRAM,IPPROTO_UDP) failed (%d-%s)", errno, strerror( errno ) ) ); + } + struct sockaddr_in listen_addr = { + .sin_family = AF_INET, + .sin_addr = { .s_addr = FD_IP4_ADDR(127, 0, 0, 1) }, + .sin_port = (ushort)fd_ushort_bswap( 8004 ), + }; + if( FD_UNLIKELY( 0!=bind( sock_fd, (struct sockaddr const *)fd_type_pun_const( &listen_addr ), sizeof(struct sockaddr_in) ) ) ) { + close( sock_fd ); + FD_LOG_ERR( ( "bind(sock_fd) failed (%d-%s)", errno, strerror( errno ) ) ); + } + udpsock ( pod, "udpsock", 2048, config->tiles.quic.xdp_rx_queue_size, config->tiles.quic.xdp_tx_queue_size ); + #endif + uint1 ( pod, "ip_addr", config->tiles.quic.ip_addr ); ushort1( pod, "listen_port", config->tiles.quic.listen_port, 0 ); diff --git a/src/app/frank/fd_frank.h b/src/app/frank/fd_frank.h index 7187625695..50b8f0fd6b 100644 --- a/src/app/frank/fd_frank.h +++ b/src/app/frank/fd_frank.h @@ -4,6 +4,7 @@ #include "../../disco/fd_disco.h" #include "../../ballet/fd_ballet.h" /* FIXME: CONSIDER HAVING THIS IN DISCO_BASE */ #include "../../tango/xdp/fd_xsk.h" +#include "../../tango/udpsock/fd_udpsock.h" /* FD_FRANK_CNC_DIAG_* are FD_CNC_DIAG_* style diagnostics and thus the same considerations apply. 
Further they are harmonized with the @@ -29,10 +30,11 @@ typedef struct { char * tile_name; ulong tile_idx; ulong idx; - uchar const * tile_pod; - uchar const * in_pod; - uchar const * out_pod; - fd_xsk_t * xsk; + uchar const * tile_pod; + uchar const * in_pod; + uchar const * out_pod; + fd_xsk_t * xsk; + fd_udpsock_t * udpsock; } fd_frank_args_t; typedef struct { diff --git a/src/app/frank/fd_frank_quic.c b/src/app/frank/fd_frank_quic.c index e4f86562fb..29f67dd8ca 100644 --- a/src/app/frank/fd_frank_quic.c +++ b/src/app/frank/fd_frank_quic.c @@ -56,6 +56,14 @@ run( fd_frank_args_t * args ) { fd_quic_t * quic = fd_quic_join( fd_wksp_pod_map( args->tile_pod, "quic" ) ); if( FD_UNLIKELY( !quic ) ) FD_LOG_ERR(( "fd_quic_join failed" )); + FD_LOG_INFO(( "loading quic" )); + fd_quic_qos_t * quic_qos = fd_quic_qos_join( fd_wksp_pod_map( args->tile_pod, "quic_qos" ) ); + if( FD_UNLIKELY( !quic_qos ) ) FD_LOG_ERR(( "fd_quic_qos_join failed" )); + + FD_LOG_INFO(( "loading stake" )); + fd_stake_t * stake = fd_stake_join( fd_wksp_pod_map( args->tile_pod, "stake" ) ); + if( FD_UNLIKELY( !stake ) ) FD_LOG_ERR(( "fd_stake_join failed" )); + FD_LOG_INFO(( "loading xsk_aio" )); fd_xsk_aio_t * xsk_aio = fd_xsk_aio_join( fd_wksp_pod_map( args->tile_pod, "xsk_aio" ), args->xsk ); if( FD_UNLIKELY( !xsk_aio ) ) FD_LOG_ERR(( "fd_xsk_aio_join failed" )); @@ -122,7 +130,12 @@ run( fd_frank_args_t * args ) { /* Start serving */ FD_LOG_INFO(( "%s(%lu) run", args->tile_name, args->tile_idx )); - int err = fd_quic_tile( cnc, quic, xsk_aio, mcache, dcache, lazy, rng, scratch ); + #if FD_HAS_XDP + #define SOCK args->xsk + #else + #define SOCK args->udpsock + #endif + int err = fd_quic_tile( cnc, quic, quic_qos, stake, SOCK, mcache, dcache, lazy, rng, scratch ); if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_quic_tile failed (%i)", err )); } diff --git a/src/ballet/x509/fd_x509.c b/src/ballet/x509/fd_x509.c index 3bb8bb6251..2550b14888 100644 --- a/src/ballet/x509/fd_x509.c +++ b/src/ballet/x509/fd_x509.c @@ -50,14 +50,17 @@ fd_x509_gen_solana_cert( EVP_PKEY * pkey ) { /* Generate serial number */ long serial; - if( FD_UNLIKELY( 1!=RAND_bytes( (uchar *)&serial, sizeof(long) ) ) ) { + if ( FD_UNLIKELY( 1 != RAND_bytes( (uchar *)&serial, sizeof( long ) ) ) ) { FD_LOG_WARNING(( "RAND_bytes() failed" )); goto cleanup1; } ASN1_INTEGER_set( X509_get_serialNumber(x), serial ); /* Set public key (the only important part) */ - X509_set_pubkey( x, pkey ); + if ( FD_UNLIKELY( 1 != X509_set_pubkey( x, pkey ) ) ) { + FD_LOG_WARNING(( "X509_set_pubkey() failed" )); + goto cleanup1; + }; /* Set very long expiration date */ long not_before = 0L; /* Jan 1 00:00:00 1975 GMT */ diff --git a/src/disco/quic/fd_quic.h b/src/disco/quic/fd_quic.h index 7a9e1687ee..37e99a4d66 100644 --- a/src/disco/quic/fd_quic.h +++ b/src/disco/quic/fd_quic.h @@ -38,7 +38,10 @@ #include "../fd_disco_base.h" #include "../../tango/quic/fd_quic.h" +#include "../../tango/quic/fd_quic_qos.h" +#include "../../tango/quic/tls/fd_quic_tls.h" #include "../../tango/xdp/fd_xdp.h" +#include "../../tango/udpsock/fd_udpsock.h" #include "../../ballet/txn/fd_txn.h" #if FD_HAS_HOSTED @@ -118,14 +121,20 @@ FD_FN_CONST ulong fd_quic_tile_scratch_footprint( ulong depth ); int -fd_quic_tile( fd_cnc_t * cnc, /* Local join to the tile's command-and-control */ - fd_quic_t * quic, /* QUIC without active join */ - fd_xsk_aio_t * xsk_aio, /* Local join to QUIC XSK aio */ - fd_frag_meta_t * mcache, /* Local join to the tile's txn output mcache */ - uchar * dcache, /* Local join to the tile's 
txn output dcache */ - long lazy, /* Laziness, <=0 means use a reasonable default */ - fd_rng_t * rng, /* Local join to the rng this tile should use */ - void * scratch ); /* Tile scratch memory */ +fd_quic_tile( fd_cnc_t * cnc, /* Local join to the tile's command-and-control */ + fd_quic_t * quic, /* QUIC without active join */ + fd_quic_qos_t * quic_qos, /* Local join to QoS */ + fd_stake_t * stake, + #if FD_HAS_XDP + fd_xsk_aio_t * xsk_aio, /* Local join to QUIC XSK aio */ + #else + fd_udpsock_t * udpsock, /* Local join to QUIC udp sock */ + #endif + fd_frag_meta_t * mcache, /* Local join to the tile's txn output mcache */ + uchar * dcache, /* Local join to the tile's txn output dcache */ + long lazy, /* Laziness, <=0 means use a reasonable default */ + fd_rng_t * rng, /* Local join to the rng this tile should use */ + void * scratch ); /* Tile scratch memory */ FD_PROTOTYPES_END diff --git a/src/disco/quic/fd_quic_tile.c b/src/disco/quic/fd_quic_tile.c index f032285155..b6672ce4b9 100644 --- a/src/disco/quic/fd_quic_tile.c +++ b/src/disco/quic/fd_quic_tile.c @@ -1,4 +1,11 @@ #include "fd_quic.h" +#include "../../util/net/fd_ip4.h" +#include +#include +#include +#include +#include +#include #if !FD_HAS_HOSTED #error "fd_quic tile requires FD_HAS_HOSTED" @@ -45,7 +52,6 @@ fd_quic_dcache_msg_ctx( uchar * app_laddr, struct fd_quic_tpu_ctx { /* dcache */ - uchar * base; /* dcache chunk region */ uchar * dcache_app; /* dcache app region */ ulong chunk0; @@ -60,11 +66,18 @@ struct fd_quic_tpu_ctx { ulong depth; /* publish stack */ - fd_quic_tpu_msg_ctx_t ** pubq; - /* meta */ + /* stake */ + fd_stake_t * stake; + /* qos */ + fd_quic_qos_t * quic_qos; + + /* rng */ + fd_rng_t * rng; + + /* meta */ ulong cnc_diag_tpu_conn_live_cnt; ulong cnc_diag_tpu_conn_seq; }; @@ -82,31 +95,47 @@ fd_tpu_now( void * ctx ) { return (ulong)fd_log_wallclock(); } -/* fd_tpu_conn_create implements fd_quic_cb_conn_new_t */ +/* fd_tpu_conn_new implements fd_quic_cb_conn_new_t + + calls `fd_quic_qos_conn_new to implement connection prioritization and flow + control logic */ static void -fd_tpu_conn_create( fd_quic_conn_t * conn, - void * _ctx ) { +fd_tpu_conn_new( fd_quic_conn_t * conn, + void * _ctx ) { conn->local_conn_id = ++conn_seq; + fd_stake_pubkey_t pubkey = { 0 }; + fd_stake_pubkey_t * pubkey_ptr = &pubkey; + int verify_result = fd_quic_tls_get_pubkey( conn->tls_hs, pubkey.pubkey, FD_STAKE_PUBKEY_SZ ); + /* we only care about self-signed certs identifying the connection's associated Solana pubkey */ + if ( FD_UNLIKELY( verify_result != X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT ) ) { + FD_DEBUG( FD_LOG_WARNING( ( "Failed to get conn: %lu's pubkey", conn->local_conn_id ) ) ); + pubkey_ptr = NULL; + } fd_quic_tpu_ctx_t * ctx = (fd_quic_tpu_ctx_t *)_ctx; + fd_quic_qos_conn_new( ctx->quic_qos, ctx->stake, ctx->rng, conn, pubkey_ptr ); + ctx->cnc_diag_tpu_conn_seq = conn_seq; ctx->cnc_diag_tpu_conn_live_cnt++; } -/* fd_tpu_conn_destroy implements fd_quic_cb_conn_final_t */ +/* fd_tpu_conn_final implements fd_quic_cb_conn_final_t */ static void -fd_tpu_conn_destroy( fd_quic_conn_t * conn, - void * _ctx ) { - (void)conn; - +fd_tpu_conn_final( fd_quic_conn_t * conn, + void * _ctx ) { fd_quic_tpu_ctx_t * ctx = (fd_quic_tpu_ctx_t *)_ctx; + fd_quic_qos_pq_t * pq = ctx->quic_qos->pq; + fd_quic_qos_pq_t * query = fd_quic_qos_pq_query( pq, conn->local_conn_id, NULL ); + if ( FD_UNLIKELY( (query) ) ) { /* most connections likely unstaked */ + fd_quic_qos_pq_remove( pq, query ); + } ctx->cnc_diag_tpu_conn_live_cnt--; } /* 
fd_tpu_stream_create implements fd_quic_cb_stream_new_t */ static void -fd_tpu_stream_create( fd_quic_stream_t * stream, +fd_tpu_stream_new( fd_quic_stream_t * stream, void * _ctx, int type ) { @@ -189,7 +218,7 @@ fd_tpu_stream_receive( fd_quic_stream_t * stream, ulong total_sz = offset+data_sz; if( FD_UNLIKELY( total_sz>FD_TPU_MTU || total_szcb; if( FD_UNLIKELY( !quic_cb ) ) { FD_LOG_WARNING(( "NULL quic callbacks") ); return 1; } - quic_cb->conn_new = fd_tpu_conn_create; + quic_cb->conn_new = fd_tpu_conn_new; quic_cb->conn_hs_complete = NULL; - quic_cb->conn_final = fd_tpu_conn_destroy; - quic_cb->stream_new = fd_tpu_stream_create; + quic_cb->conn_final = fd_tpu_conn_final; + quic_cb->stream_new = fd_tpu_stream_new; quic_cb->stream_notify = fd_tpu_stream_notify; quic_cb->stream_receive = fd_tpu_stream_receive; @@ -394,6 +429,8 @@ fd_quic_tile( fd_cnc_t * cnc, quic_ctx.mcache = mcache; quic_ctx.depth = depth; quic_ctx.inflight_streams = 0UL; + quic_ctx.quic_qos = quic_qos; + quic_ctx.stake = stake; quic_cb->quic_ctx = &quic_ctx; @@ -444,9 +481,15 @@ fd_quic_tile( fd_cnc_t * cnc, /* Reload housekeeping timer */ then = now + (long)fd_tempo_async_reload( rng, async_min ); } + + /* Poll network backend */ + #if FD_HAS_XDP fd_xsk_aio_service( xsk_aio ); + #else + fd_udpsock_service( udpsock ); + #endif /* Service QUIC clients */ fd_quic_service( quic ); diff --git a/src/disco/quic/test_quic_tile.c b/src/disco/quic/test_quic_tile.c index e304799f0c..46fb505066 100644 --- a/src/disco/quic/test_quic_tile.c +++ b/src/disco/quic/test_quic_tile.c @@ -1,6 +1,7 @@ #include "../../util/fd_util.h" #if FD_HAS_HOSTED && FD_HAS_X86 && FD_HAS_OPENSSL +#define FD_HAS_XDP 1 #include "fd_quic.h" #include "../../tango/xdp/fd_xdp.h" @@ -28,6 +29,8 @@ struct test_cfg { long tx_lazy; uint tx_seed; fd_quic_t * tx_quic; + fd_quic_qos_t * qos; + fd_stake_t * stake; fd_quic_config_t * tx_quic_cfg; fd_cnc_t * rx_cnc; @@ -187,6 +190,8 @@ tx_tile_main( int argc, FD_TEST( !fd_quic_tile( cfg->tx_cnc, cfg->tx_quic, + cfg->qos, + cfg->stake, cfg->xsk_aio, cfg->tx_mcache, cfg->tx_dcache, @@ -338,6 +343,8 @@ int main( int argc, fd_quic_config_t * quic_cfg = &cfg->tx_quic->config; FD_TEST( quic_cfg ); + /* TODO set qos and stake */ + /* must set role first */ quic_cfg->role = FD_QUIC_ROLE_SERVER; diff --git a/src/tango/fd_tango.h b/src/tango/fd_tango.h index f2b8eb611e..d9f8cfa0e6 100644 --- a/src/tango/fd_tango.h +++ b/src/tango/fd_tango.h @@ -10,6 +10,8 @@ #include "dcache/fd_dcache.h" /* Includes fd_tango_base.h */ #include "tcache/fd_tcache.h" /* Includes fd_tango_base.h */ #include "aio/fd_aio.h" /* Includes fd_tango_base.h */ +#include "stake/fd_stake.h" /* Includes fd_tango_base.h */ +#include "lru/fd_lru.h" #endif /* HEADER_fd_src_tango_fd_tango_h */ diff --git a/src/tango/fd_tango_ctl.c b/src/tango/fd_tango_ctl.c index f635ea2430..6e4c2266b8 100644 --- a/src/tango/fd_tango_ctl.c +++ b/src/tango/fd_tango_ctl.c @@ -1,6 +1,7 @@ #include "fd_tango.h" #include "mcache/fd_mcache_private.h" #include "dcache/fd_dcache_private.h" +#include "stake/fd_stake.h" #if FD_HAS_HOSTED @@ -721,6 +722,52 @@ main( int argc, FD_LOG_NOTICE(( "%i: %s %s: success", cnt, cmd, gaddr )); SHIFT( 1 ); + } else if( !strcmp( cmd, "new-stake" ) ) { + + if( FD_UNLIKELY( argc!=2 ) ) FD_LOG_ERR(( "%i: %s: wrong number of arguments\n\tDo %s help for help", cnt, cmd, bin )); + + char const * _wksp = argv[0]; + int lg_slot_cnt = fd_cstr_to_int( argv[1] ); + + fd_wksp_t * wksp = fd_wksp_attach( _wksp ); + if( FD_UNLIKELY( !wksp ) ) { + FD_LOG_ERR(( "%i: %s: 
fd_wksp_attach( \"%s\" ) failed\n\tDo %s help for help", cnt, cmd, _wksp, bin )); + } + + ulong align = fd_stake_align(); + ulong footprint = fd_stake_footprint(lg_slot_cnt); + // FD_LOG_ERR(("footprint %lu", footprint)); + ulong gaddr = fd_wksp_alloc( wksp, align, footprint, tag ); + if( FD_UNLIKELY( !gaddr ) ) { + fd_wksp_detach( wksp ); + FD_LOG_ERR(( "%i: %s: fd_wksp_alloc( \"%s\", %lu, %lu, %lu ) failed\n\tDo %s help for help", + cnt, cmd, _wksp, align, footprint, tag, bin )); + } + + void * shmem = fd_wksp_laddr( wksp, gaddr ); + // FD_LOG_HEXDUMP_ERR(("shmem", shmem, footprint )); + if( FD_UNLIKELY( !shmem ) ) { + fd_wksp_free( wksp, gaddr ); + fd_wksp_detach( wksp ); + FD_LOG_ERR(( "%i: %s: fd_wksp_laddr( \"%s\", %lu ) failed\n\tDo %s help for help", cnt, cmd, _wksp, gaddr, bin )); + } + + void * shstake = fd_stake_new( shmem, lg_slot_cnt ); + if( FD_UNLIKELY( !shstake ) ) { + fd_wksp_free( wksp, gaddr ); + fd_wksp_detach( wksp ); + FD_LOG_ERR(( "%i: %s: fd_stake_new( %s:%lu, %d ) failed\n\tDo %s help for help", + cnt, cmd, _wksp, gaddr, lg_slot_cnt, bin )); + } + + char buf[ FD_WKSP_CSTR_MAX ]; + printf( "%s\n", fd_wksp_cstr( wksp, gaddr, buf ) ); + + fd_wksp_detach( wksp ); + + FD_LOG_NOTICE(( "%i: %s %s %d: success", cnt, cmd, _wksp, lg_slot_cnt )); + SHIFT( 2 ); + } else { FD_LOG_ERR(( "%i: %s: unknown command\n\t" diff --git a/src/tango/lru/Local.mk b/src/tango/lru/Local.mk new file mode 100644 index 0000000000..841ae87ede --- /dev/null +++ b/src/tango/lru/Local.mk @@ -0,0 +1,6 @@ +$(call add-hdrs,fd_lru.h fd_list.h) +$(call add-objs,fd_lru fd_list,fd_tango) +$(call make-unit-test,test_lru,test_lru,fd_tango fd_util) +$(call make-unit-test,test_list,test_list,fd_tango fd_util) +$(call run-unit-test,test_lru) +$(call run-unit-test,test_list) diff --git a/src/tango/lru/fd_list.c b/src/tango/lru/fd_list.c new file mode 100644 index 0000000000..3c000ec0fe --- /dev/null +++ b/src/tango/lru/fd_list.c @@ -0,0 +1,103 @@ +#include "fd_list.h" + +#define FD_LIST_ALIGN ( 32UL ) /* 2-nodes per L1 cache line */ + +ulong +fd_list_align( void ) { + return FD_LIST_ALIGN; +} + +ulong +fd_list_footprint( ulong max ) { + return ( max + 1 ) * sizeof( fd_list_t ); +} + +void * +fd_list_new( void * mem, ulong max ) { + fd_list_t * sentinel = (fd_list_t *)mem; + sentinel->tag = 0; + sentinel->curr = 0; + sentinel->prev = 0; + sentinel->next = 0; + fd_list_t * curr = sentinel; + for ( ulong i = 1; i <= max; i++ ) { + fd_list_t * new = sentinel + i; + new->curr = i; + fd_list_insert( curr, new ); + curr = new; + } + curr = sentinel; + curr = fd_list_head( sentinel ); + while (curr != sentinel) { + curr = fd_list_next(curr); + } + return mem; +} + +fd_list_t * +fd_list_join( void * mem ) { + return (fd_list_t *)mem; +} + +fd_list_t * +fd_list_prev( fd_list_t * curr ) { + return curr - curr->curr + curr->prev; +} + +fd_list_t * +fd_list_next( fd_list_t * curr ) { + return curr - curr->curr + curr->next; +} + +fd_list_t * +fd_list_sentinel( fd_list_t * list ) { + return list - list->curr; +} + +fd_list_t * +fd_list_head( fd_list_t * list ) { + fd_list_t * sentinel = fd_list_sentinel( list ); + return fd_list_next( sentinel ); +} + +fd_list_t * +fd_list_tail( fd_list_t * list ) { + fd_list_t * sentinel = fd_list_sentinel( list ); + return fd_list_prev( sentinel ); +} + +int +fd_list_is_empty( fd_list_t * list ) { + return fd_list_head(list) == fd_list_sentinel( list ); +} + +fd_list_t * +fd_list_insert( fd_list_t * curr, fd_list_t * new ) { + new->prev = curr->curr; + new->next = curr->next; + 
fd_list_next( curr )->prev = new->curr; + curr->next = new->curr; + return new; +} + +fd_list_t * +fd_list_remove( fd_list_t * curr ) { + if ( FD_UNLIKELY( fd_list_is_empty( curr ) ) ) return NULL; + fd_list_prev( curr )->next = curr->next; + fd_list_next( curr )->prev = curr->prev; + curr->prev = 0; + curr->next = 0; + return curr; +} + +fd_list_t * +fd_list_push_back( fd_list_t * list, fd_list_t * new ) { + return fd_list_insert( fd_list_tail( list ), new ); +} + +fd_list_t * +fd_list_pop_front( fd_list_t * list ) { + fd_list_t * head = fd_list_head( list ); + if ( FD_UNLIKELY( fd_list_is_empty( list ) ) ) { return NULL; } + return fd_list_remove( head ); +} diff --git a/src/tango/lru/fd_list.h b/src/tango/lru/fd_list.h new file mode 100644 index 0000000000..542f364d7c --- /dev/null +++ b/src/tango/lru/fd_list.h @@ -0,0 +1,79 @@ +#ifndef HEADER_fd_src_util_list_fd_list_h +#define HEADER_fd_src_util_list_fd_list_h + +#include "../../util/fd_util.h" + +/* An implementation of an intrusive doubly-linked list. + + ----- + 1 / 0x0 = 0 + ----- + 5 / 0x4 = 4 + ----- + 3 / 0x8 = 8 + ----- + 2 / 0x12 = 12 + ----- + 4 / 0x16 = 16 + ----- + + tag : 1 -> 2 -> 3 -> 4 -> 5 + curr: 0 -> 12 -> 8 -> 16 -> 4 + prev: / <- 0 <- 12 <- 8 <- 16 + next: 12 -> 8 -> 16 -> 4 -> / + + TODO generalize to a tmpl data structure? */ +typedef struct fd_list fd_list_t; /* forward decl */ +struct fd_list { + ulong tag; /* TODO generic */ + /* below all are offsets from the sentinel */ + ulong curr; + ulong prev; + ulong next; +}; + +ulong +fd_list_align( void ); + +ulong +fd_list_footprint( ulong max ); + +void * +fd_list_new( void * mem, ulong max ); + +fd_list_t * +fd_list_join( void * mem ); + +fd_list_t * +fd_list_prev( fd_list_t * curr ); + +fd_list_t * +fd_list_next( fd_list_t * curr ); + +fd_list_t * +fd_list_sentinel( fd_list_t * list ); + +fd_list_t * +fd_list_head( fd_list_t * list ); + +fd_list_t * +fd_list_tail( fd_list_t * list ); + +int +fd_list_is_empty( fd_list_t * list ); + +/* a list can insert an element directly after itself */ +fd_list_t * +fd_list_insert( fd_list_t * curr, fd_list_t * new ); + +/* a list can remove itself */ +fd_list_t * +fd_list_remove( fd_list_t * new ); + +fd_list_t * +fd_list_push_back( fd_list_t * list, fd_list_t * new ); + +fd_list_t * +fd_list_pop_front( fd_list_t * list ); + +#endif /* HEADER_fd_src_util_list_fd_list_h */ diff --git a/src/tango/lru/fd_lru.c b/src/tango/lru/fd_lru.c new file mode 100644 index 0000000000..5660e190d3 --- /dev/null +++ b/src/tango/lru/fd_lru.c @@ -0,0 +1,123 @@ +#include "fd_lru.h" +#include "fd_list.h" + +#define DEPTH_OFFSET (2UL) + +ulong +fd_lru_align( void ) { + return FD_LRU_ALIGN; +} + +ulong +fd_lru_footprint( ulong depth, + ulong map_cnt ) { + if( !map_cnt ) map_cnt = fd_lru_map_cnt_default( depth ); /* use default */ + + if( FD_UNLIKELY( (!depth) | (map_cnt<(depth+2UL)) | (!fd_ulong_is_pow2( map_cnt )) ) ) return 0UL; /* Invalid depth / max_cnt */ + + /* TODO overflow checks*/ + ulong footprint = sizeof(fd_lru_t); + footprint += (depth + 1) * sizeof(fd_list_t); + footprint += map_cnt * sizeof( ulong ); /* pointer-size */ + footprint = fd_ulong_align_up( footprint, fd_lru_align() ); + return footprint; +} + +void * +fd_lru_new( void * shmem, + ulong depth, + ulong map_cnt ) { + if( !map_cnt ) map_cnt = fd_lru_map_cnt_default( depth ); /* use default */ + + if( FD_UNLIKELY( !shmem ) ) { + FD_LOG_WARNING(( "NULL shmem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_lru_align() ) ) ) { + FD_LOG_WARNING(( 
"misaligned shmem" )); + return NULL; + } + + ulong footprint = fd_lru_footprint( depth, map_cnt ); + if( FD_UNLIKELY( !footprint ) ) { + FD_LOG_WARNING(( "bad depth (%lu) and/or map_cnt (%lu)", depth, map_cnt )); + return NULL; + } + + fd_memset( shmem, 0, footprint ); + + fd_lru_t * lru = (fd_lru_t *)shmem; + + lru->depth = depth; + lru->free_top = 1UL; + lru->map_cnt = map_cnt; + fd_list_new( fd_lru_list_laddr( lru ), depth ); + + // FD_LOG_HEXDUMP_NOTICE(("lru", lru, footprint)); + + FD_COMPILER_MFENCE(); + FD_VOLATILE( lru->magic ) = FD_LRU_MAGIC; + FD_COMPILER_MFENCE(); + + return shmem; +} + +fd_lru_t * +fd_lru_join( void * _lru ) { + + if( FD_UNLIKELY( !_lru ) ) { + FD_LOG_WARNING(( "NULL _lru" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)_lru, fd_lru_align() ) ) ) { + FD_LOG_WARNING(( "misaligned _lru" )); + return NULL; + } + + fd_lru_t * lru = (fd_lru_t *)_lru; + if( FD_UNLIKELY( lru->magic!=FD_LRU_MAGIC ) ) { + FD_LOG_WARNING(( "bad magic" )); + return NULL; + } + + return lru; +} + +void * +fd_lru_leave( fd_lru_t * lru ) { + + if( FD_UNLIKELY( !lru ) ) { + FD_LOG_WARNING(( "NULL lru" )); + return NULL; + } + + return (void *)lru; +} + +void * +fd_lru_delete( void * _lru ) { + + if( FD_UNLIKELY( !_lru ) ) { + FD_LOG_WARNING(( "NULL _lru" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)_lru, fd_lru_align() ) ) ) { + FD_LOG_WARNING(( "misaligned _lru" )); + return NULL; + } + + fd_lru_t * lru = (fd_lru_t *)_lru; + if( FD_UNLIKELY( lru->magic != FD_LRU_MAGIC ) ) { + FD_LOG_WARNING(( "bad magic" )); + return NULL; + } + + FD_COMPILER_MFENCE(); + FD_VOLATILE( lru->magic ) = 0UL; + FD_COMPILER_MFENCE(); + + return _lru; +} diff --git a/src/tango/lru/fd_lru.h b/src/tango/lru/fd_lru.h new file mode 100644 index 0000000000..a1580c1b4d --- /dev/null +++ b/src/tango/lru/fd_lru.h @@ -0,0 +1,416 @@ +#ifndef HEADER_fd_src_tango_lru_fd_lru_h +#define HEADER_fd_src_tango_lru_fd_lru_h + +/* fd_lru_t is very similar to fd_tcache_t. The main differences are: + 1. instead of a ring, it uses a doubly linked list. + 2. instead of a map of tag -> index, it uses a map of tag -> node. + + Keeping in mind these differences, the API and documentation is otherwise + based on `fd_tcache.h`. + + A fd_lru_t is a cache of the most recently observed unique 64-bit + tags. It is useful for, among other things, deduplication of traffic + based on a thumbprint / hash / signature. Makes no demands on the + application on tagging scheme except that there be a "null" tag (a + tag value that will never occur). + + The amount of history ("depth") of a lru is theoretically + unlimited but the implementation below was optimized for large-ish + depths (e.g. millions) on platforms where memory footprint is + reasonable cheap. The implementation was also optimized for + situations where heavily duplication is common and temporally + localized. Lastly, the implementation is optimized for the case that + tags behave like IID random values (e.g. a tag is a hash of a packet + payload). + + It is strongly recommend that the lru be backed by a single NUMA + page (e.g. in a gigantic page backed workspace) to avoid TLB + thrashing if used in performance critical contexts. */ + +#include "../fd_tango_base.h" +#include "fd_list.h" + +/* FD_TCACHE_{ALIGN,FOOTPRINT} specify the alignment and footprint + needed for a tcache with depth history and a tag key-only map with + map_cnt slots. ALIGN is at least double cache line to mitigate + various kinds of false shalist. 
depth and map_cnt are assumed to be
+   valid (i.e. depth is positive, map_cnt is an integer power of 2 of at
+   least depth+2 and the combination will not require a footprint larger
+   than ULONG_MAX). */
+#define FD_LRU_ALIGN ( 128UL )
+
+#define FD_LRU_TAG_NULL ( 0UL )
+
+#define FD_LRU_SPARSE_DEFAULT ( 2 )
+
+#define FD_LRU_MAGIC ( 0xf17eda2c3712C0UL ) /* firedancer lru ver 0 */
+
+struct __attribute__( ( aligned( FD_LRU_ALIGN ) ) ) fd_lru_private {
+  ulong magic;    /* ==FD_LRU_MAGIC */
+  ulong depth;    /* The lru will maintain a history of the most recent depth tags */
+  ulong free_top;
+  ulong map_cnt;
+
+  /* depth ulong (doubly linked list):
+
+     After the lru has started up (i.e. at least depth unique tags
+     have been inserted), list[oldest] will contain the oldest tag in
+     the lru.  This is a circular doubly linked list with a sentinel:
+     the entry before sentinel (cyclic) is the newest tag in the lru and
+     the list entry after oldest (cyclic) is the 2nd oldest tag in the
+     lru.  During startup (the first depth-1 unique tags inserted),
+     list[oldest] will be FD_LRU_TAG_NULL.  In high performance operation,
+     only the slots around oldest will be in active use / occupy local
+     cache and the access pattern will be highly sequential. */
+
+  /* map_cnt ulong (map):
+
+     This is a sparse linear probed key-only map of tags currently in
+     the lru.  Since it is sparse, probe collisions are rare (and thus
+     the branches involved in various cache operations are highly
+     predictable).  While the sparsity makes the map reasonably
+     inefficient from a memory footprint point of view, memory footprint
+     is quite cheap in practice and the actual cache utilization is
+     quite mild.  Specifically, if tag duplication is rare, only the
+     slots around the newest and oldest tags will be in use typically.
+     Further, if any tag duplication is temporally clustered (as is
+     commonly the case), duplicate tags will be a cache hit against the
+     (still cached because of recent use) original insertion.
+
+     In the typical case of randomized tags, this randomly accesses the
+     map aggressively.  The NUMA and TLB thrashing impacts of that can
+     be reduced / eliminated by backing the lru with a huge /
+     gigantic page shared workspace on a NUMA node nearby the lru
+     using threads. */
+
+  /* Padding to FD_LRU align */
+};
+
+typedef struct fd_lru_private fd_lru_t;
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_lru_map_cnt_default returns the default map_cnt to use for the
+   given depth.  Returns 0 if the depth is invalid / results in a
+   map_cnt larger than ULONG_MAX. */
+
+FD_FN_CONST static inline ulong
+fd_lru_map_cnt_default( ulong depth ) {
+
+  if ( FD_UNLIKELY( !depth ) ) return 0UL; /* depth must be positive */
+
+  if ( FD_UNLIKELY( depth == ULONG_MAX ) ) return 0UL; /* overflow */
+  int lg_map_cnt = fd_ulong_find_msb( depth + 1UL ) + FD_LRU_SPARSE_DEFAULT; /* no overflow */
+  if ( FD_UNLIKELY( lg_map_cnt > 63 ) ) return 0UL; /* depth too large */
+
+  /* At this point:
+
+       2^(lg_map_cnt-s) <= depth+n < 2^(lg_map_cnt-s+1)
+
+     where s is SPARSE_DEFAULT > 0 and n is 1.
+
+       map_cnt/2^s - n <= depth < map_cnt/2^(s-1) - n
+       1/2^s - n/map_cnt <= depth/map_cnt < 1/2^(s-1) - n/map_cnt
+
+     For asymptotically large depth / map_cnt, the worst case map fill
+     ratio will asymptote to something in ~( 1/2^s, 1/2^(s-1) ).
+     Flipping this around, we also have:
+
+       -> 2^(s-1) (depth+n) < map_cnt <= 2^s (depth+n)
+
+     In the worst case, s==1, depth+1 < map_cnt -> map_cnt>=depth+2.
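+
+     As a concrete check of the formula above (matching the expectations in
+     test_lru.c): depth==7 gives lg_map_cnt = fd_ulong_find_msb( 8UL ) +
+     FD_LRU_SPARSE_DEFAULT = 3 + 2 = 5, i.e. a default map_cnt of 32.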
*/ + + return 1UL << lg_map_cnt; +} + +/* fd_lru_{align,footprint} return the required alignment and + footprint of a memory region suitable for use as an lru. + fd_lru_align returns FD_LRU_ALIGN. For fd_lru_footprint, a + map_cnt of 0 indicates to use fd_lru_map_cnt_default above. If + depth is not positive, map_cnt is not a power of 2 of at least + depth+2 and/or the required footprint would be larger than ULONG_MAX, + footprint will silently return 0 (and thus can be used by the caller + to validate the lru configuration parameters). Otherwise, it + returns FD_LRU_FOOTPRINT for actual value of map_cnt used. */ + +FD_FN_CONST ulong +fd_lru_align( void ); + +FD_FN_CONST ulong +fd_lru_footprint( ulong depth, ulong map_cnt ); + +/* fd_lru_new formats an unused memory region for use as an lru. + shmem is a non-NULL pointer to this region in the local address space + with the required footprint and alignment. depth is the number of + unique tags that can be stored in the lru and should be positive + (positive integer powers of 2 minus 2 have good memory footprint Feng + Shui and postive integer powers of 2 minus 1 have good computational + efficiency Feng Shui). map_cnt is the number of slots to use for the + map. A map_cnt of 0 indicates to fd_lru_map_cnt_default above. + + Returns shmem (and the memory region it points to will be formatted + as a lru, caller is not joined, lru will be empty) on success + and NULL on failure (logs details). Reasons for failure include + obviously bad shmem, bad depth or bad map_cnt. */ + +void * +fd_lru_new( void * shmem, ulong depth, ulong map_cnt ); + +/* fd_lru_join joins the caller to the lru. _lru points to the + first byte of the memory region backing the lru in the caller's + address space. + + Returns a pointer in the local address space to the lru's entries + on success (this is not necessarily just a cast of _lru) and NULL + on failure (logs details). Reasons for failure are that _lru is + obviously not a pointer to memory region holding a lru. Every + successful join should have a matching leave. The lifetime of the + join is until the matching leave or thread group is terminated. */ + +fd_lru_t * +fd_lru_join( void * _lru ); + +/* fd_lru_leave leaves a current local join. Returns a pointer to + the underlying shared memory region on success (this is not + necessarily just a cast of _lru) and NULL on failure (logs + details). Reasons for failure include lru is NULL. */ + +void * +fd_lru_leave( fd_lru_t * lru ); + +/* fd_lru_delete unformats a memory region used as an lru. Assumes + nobody is joined to the region. Returns a pointer to the underlying + shared memory region or NULL if used obviously in error (e.g. + _lru is obviously not a lru ... logs details). The ownership + of the memory region is transferred to the caller. */ + +void * +fd_lru_delete( void * _lru ); + +/* fd_lru_{depth,map_cnt,oldest_laddr,list_laddr,map_laddr} return + various properties of the lru. These assume lru is a valid + local join. Since lru is used in performance critical code paths, + typical usage will unpack lru list and map pointers into registers + and the current value for oldest will be tracked in a register as + well. It is the responsibility of users to update the value at + oldest_laddr at termination to do clean restarts on an in progress + lru. 
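+
+   For example, a caller might hoist these out of a critical loop like so
+   (illustrative sketch only):
+
+     fd_list_t *  list    = fd_lru_list_laddr( lru );
+     fd_list_t ** map     = fd_lru_map_laddr ( lru );
+     ulong        map_cnt = fd_lru_map_cnt   ( lru );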
*/ + +FD_FN_PURE static inline ulong +fd_lru_depth( fd_lru_t const * lru ) { + return lru->depth; +} +FD_FN_PURE static inline ulong +fd_lru_free_top( fd_lru_t const * lru ) { + return lru->free_top; +} +FD_FN_PURE static inline ulong +fd_lru_map_cnt( fd_lru_t const * lru ) { + return lru->map_cnt; +} + +FD_FN_CONST static inline fd_list_t * +fd_lru_list_laddr( fd_lru_t * lru ) { + return (fd_list_t *)fd_type_pun( lru + 1UL ); /* both metadata and fd_list_t are 32-byte */ +} + +FD_FN_PURE static inline fd_list_t ** +fd_lru_map_laddr( fd_lru_t * lru ) { + return (fd_list_t **)fd_type_pun( fd_lru_list_laddr( lru ) + lru->depth + 1UL ); +} + +/* fd_lru_tag_is_null returns non-zero if tag is FD_LRU_TAG_NULL + and zero otherwise. */ + +FD_FN_CONST static inline int +fd_lru_tag_is_null( ulong tag ) { + return tag == FD_LRU_TAG_NULL; +} + +/* fd_lru_map_start returns the location in a lru map to start + probing for tag. Assumes tag is not null and map_cnt is a positive + integer power of 2. Implementation here is optimized for the case + where tags are randomized. + + fd_lru_map_next returns the next location to probe given the + current location. idx is assumed in [0,map_cnt) and map_cnt is + assumed to be a positive integer power of 2. */ + +FD_FN_CONST static inline ulong +fd_lru_map_start( ulong tag, ulong map_cnt ) { + return tag & ( map_cnt - 1UL ); +} +FD_FN_CONST static inline ulong +fd_lru_map_next( ulong idx, ulong map_cnt ) { + return ( idx + 1UL ) & ( map_cnt - 1UL ); +} + +/* FD_LRU_QUERY searches for tag in a map with map_cnt slots. On + return, map_idx will be in [0,map_cnt) and found will be in [0,1]. + If found is 0, map_idx is a suitable location where tag can be + inserted into the map, assuming the map has at most map_cnt-2 entries + currently in it. If found is is 1, map_idx is the index into the map + where tag is currently located (this index will be valid until the + next map remove or map destruction). + + For sparse fill ratios and properly randomized map_starts, this is a + fast O(1). + + This is implemented as a macro to support multiple return values + (found and map_idx), especially as this is used in performance + critical contexts. Similarly, does no input argument checking and + uses the unpacked fields of an lru. Assumes map is non-NULL, map + is indexed [0,map_cnt), map_cnt is a positive integer power-of-two + and tag is not null. + + This macro is robust (e.g. evaluates its arguments a minimal number + of times) and pure (i.e. found / map_idx will not change between + calls given the same map / map[*] / tag). */ + +#define FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ) \ + do { \ + fd_list_t * const * _flq_map = ( map ); \ + ulong _flq_map_cnt = ( map_cnt ); \ + ulong _flq_tag = ( tag ); \ + int _flq_found = 0; \ + ulong _flq_map_idx = fd_lru_map_start( _flq_tag, _flq_map_cnt ); \ + for ( ;; ) { \ + fd_list_t * _flq_map_slot = _flq_map[_flq_map_idx]; \ + if ( _flq_map_slot == NULL ) break; \ + _flq_found = ( _flq_tag == _flq_map_slot->tag ); \ + if ( FD_LIKELY( _flq_found | fd_lru_tag_is_null( _flq_map_slot->tag ) ) ) break; \ + _flq_map_idx = fd_lru_map_next( _flq_map_idx, _flq_map_cnt ); \ + } \ + ( found ) = _flq_found; \ + ( map_idx ) = _flq_map_idx; \ + } while ( 0 ) + +/* fd_lru_remove removes tag in a map with map_cnt slots. For + sparsely populated maps and properly randomized tags, this is a fast + O(1). This does not remove tag from the list, so the user is responsible + for the removing the value from the list. 
As this is used in performance + critical contexts, does no input argument checking and uses the + unpacked fields of an lru. Assumes map is non-NULL, map is indexed + [0,map_cnt) and map_cnt is a positive integer power-of-two. Does + nothing if tag is null or if tag is not currently in the map. */ + +FD_FN_UNUSED static void /* Work around -Winline */ +fd_lru_remove( fd_lru_t * lru, ulong tag ) { + + /* Look up tag in the lru. If not found, nothing to do. (This + should always succeed at this point in typical lru usage but we + keep the check for paranoia / minimize risk of silent corruption.) */ + + int found; + ulong slot; + fd_list_t ** map = fd_lru_map_laddr( lru ); + FD_LRU_QUERY( found, slot, map, lru->map_cnt, tag ); + if ( FD_LIKELY( found ) ) { + + /* slot contains the tag to remove. Remove it. See util/fd_map* + for details how this works. */ + + for ( ;; ) { + map[slot] = FD_LRU_TAG_NULL; + ulong hole = slot; + fd_list_t * next; + for ( ;; ) { + slot = fd_lru_map_next( slot, lru->map_cnt ); + next = map[slot]; + if ( FD_LIKELY( next == NULL || fd_lru_tag_is_null( next->tag ) ) ) return; + ulong start = fd_lru_map_start( tag, lru->map_cnt ); + if ( !( ( ( hole < start ) & ( start <= slot ) ) | + ( ( hole > slot ) & ( ( hole < start ) | ( start <= slot ) ) ) ) ) + break; + } + map[hole] = next; + } + } +} + +static inline fd_list_t * +fd_lru_list_acquire( fd_lru_t * lru ) { + fd_list_t * sentinel = fd_lru_list_laddr( lru ); + fd_list_t * free_top = sentinel + lru->free_top; + lru->free_top = free_top->next; + return fd_list_remove( free_top ); +} + +/* user is responsible for ensuring curr is removed */ +static inline void +fd_lru_list_release( fd_lru_t * lru, fd_list_t * curr ) { + fd_list_t * sentinel = fd_lru_list_laddr( lru ); + fd_list_t * free_top = sentinel + lru->free_top; + fd_list_insert( fd_list_prev( free_top ), curr ); + lru->free_top = curr->curr; +} + +static inline fd_list_t * +fd_lru_list_head( fd_lru_t * lru ) { + return fd_list_next( fd_lru_list_laddr( lru ) ); +} + +static inline fd_list_t * +fd_lru_list_tail( fd_lru_t * lru ) { + return fd_list_prev( fd_lru_list_laddr( lru ) + lru->free_top ); +} + +/* fd_lru_upsert upserts tag into the lru in fast O(1) operations. + On return, if tag is already in the lru, the tag will be moved to + most recent position (back). If tag is not in the lru, tag was inserted, + and if the lru was full (i.e. had already contained depth values), the + oldest tag in the lru will have been evicted. + + Returns the evicted element (if any) or NULL if no element was evicted. + + Assumes oldest is in [0,depth), list is non-NULL and indexed + [0,depth), depth is positive, map is non-NULL, map is indexed + [0,map_cnt), map_cnt is an integer power-of-two of at least depth+2, + and tag is not null. + + Map entries store the location in the list structure. On a duplicate + tag, insert will move the duplicate tag from its current location in + the list (given from the query itself) to one immediately before the + oldest tag in the list and update the map entry (and similar for + unique tag insert). 
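+
+     As a small worked example (assuming depth 2): upsert(A) and upsert(B) fill
+     the cache, upsert(C) then evicts and returns the node that held A (the
+     least recently used tag), and a subsequent upsert(B) returns NULL and
+     simply moves B to the most recent (tail) position.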
*/ + +static inline fd_list_t * +fd_lru_upsert( fd_lru_t * lru, ulong tag) { + ulong map_idx; + fd_list_t ** map = fd_lru_map_laddr( lru ); + int found; + // FD_LOG_NOTICE(("fd_lru(lru) %p", (void *)lru)); + // FD_LOG_NOTICE(("fd_lru_list_laddr(lru) %p", (void *)fd_lru_list_laddr(lru))); + // FD_LOG_NOTICE(("fd_lru_map_laddr(lru) %p", (void *)fd_lru_map_laddr(lru))); + FD_LRU_QUERY( found, map_idx, map, lru->map_cnt, tag ); + fd_list_t * evict = NULL; + + /* lru insert */ + if ( !found ) { /* application dependent branch probability */ + /* Evict oldest tag / insert tag into list */ + if ( FD_LIKELY( lru->free_top == 0 ) ) { + fd_list_t * remove = fd_list_remove( fd_lru_list_head( lru ) ); + fd_lru_list_release( lru, remove ); + fd_lru_remove( lru, remove->tag ); + evict = remove; + } + fd_list_t * insert = fd_lru_list_acquire( lru ); + if ( insert == NULL ) FD_LOG_ERR( ( "fd_lru_list_acquire failed. lru invariant violation: should have evicted when lru was full." ) ); + insert->tag = tag; + fd_list_insert( fd_lru_list_tail( lru ), insert ); + + /* Insert tag into the map (assumes depth <= map_cnt-2) */ + /* map has at most map_cnt-2 entries here */ + map[map_idx] = insert; + /* map has at most map_cnt-1 entries here */ + + /* lru update */ + } else { + fd_list_insert( fd_lru_list_tail( lru ), fd_list_remove( map[map_idx] ) ); + } + return evict; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_tango_lru_fd_lru_h */ diff --git a/src/tango/lru/test_list.c b/src/tango/lru/test_list.c new file mode 100644 index 0000000000..66fc523c03 --- /dev/null +++ b/src/tango/lru/test_list.c @@ -0,0 +1,70 @@ +#include "fd_list.h" + +#define MAX 4 + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + if ( FD_UNLIKELY( argc > 1 ) ) FD_LOG_ERR( ( "unrecognized argument: %s", argv[1] ) ); + + fd_wksp_t * wksp = fd_wksp_new_anonymous( + FD_SHMEM_HUGE_PAGE_SZ, 1, fd_shmem_cpu_idx( fd_shmem_numa_idx( 0 ) ), "wksp", 0UL ); + FD_TEST( wksp ); + + void * mem = fd_wksp_alloc_laddr( wksp, fd_list_align(), fd_list_footprint( 2 ), 42UL ); + fd_list_t * list = fd_list_join( fd_list_new( mem, 4 ) ); + fd_list_t * sentinel = fd_list_sentinel( list ); + + fd_list_t * curr = fd_list_head( list ); + for ( ulong i = 1; i <= MAX; i++ ) { + curr->tag = i; + curr = fd_list_next(curr); + } + + ulong n = 0; + curr = fd_list_head( list ); + while ( curr->tag != 0 ) { + n++; + curr = fd_list_next(curr); + } + FD_TEST( n == MAX ); + + /* 1 -> 2 -> 3 -> 4 => 2 -> 3 -> 4 */ + fd_list_t * pop = fd_list_pop_front( list ); + FD_TEST( pop->tag == 1 ); + FD_TEST( fd_list_head( list )->tag == 2 ); + + ulong i = 1; + curr = fd_list_head( list ); + while ( curr != sentinel ) { + FD_TEST( curr->tag == i + 1 ); + curr = fd_list_next(curr); + i++; + } + + /* 2 -> 3 -> 4 => 2 -> 3 -> 4 -> 1 */ + fd_list_push_back( list, pop ); + FD_TEST( fd_list_head( list )->tag == 2 ); + FD_TEST( fd_list_tail( list )->tag == 1 ); + + /* 2 -> 3 -> 4 -> 1 => 2 -> 3 -> 1 -> 4 */ + fd_list_t * remove = fd_list_remove( fd_list_prev(fd_list_tail( list )) ); + fd_list_insert( fd_list_tail( list ), remove ); + + /* 2 -> 3 -> 1 -> 4 => 1 -> 2 -> 3 -> 4 */ + remove = fd_list_remove( fd_list_next(fd_list_next(fd_list_head( list ))) ); + fd_list_insert( sentinel, remove ); + + /* 1 -> 2 -> 3 -> 4 => NULL */ + i = 1; + while ( ( curr = fd_list_pop_front( list ) ) != NULL ) { + FD_TEST( curr->tag == i ); + i++; + } + FD_TEST( i == MAX ); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} diff --git a/src/tango/lru/test_lru.c 
b/src/tango/lru/test_lru.c new file mode 100644 index 0000000000..ba003a6734 --- /dev/null +++ b/src/tango/lru/test_lru.c @@ -0,0 +1,182 @@ +#include "../fd_tango.h" +#include "fd_lru.h" + +#if FD_HAS_HOSTED + +FD_STATIC_ASSERT( FD_LRU_ALIGN == 128UL, unit_test ); + +FD_STATIC_ASSERT( FD_LRU_TAG_NULL == 0UL, unit_test ); + +FD_STATIC_ASSERT( FD_LRU_SPARSE_DEFAULT == 2, unit_test ); + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + fd_rng_t _rng[1]; + fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0U, 0UL ) ); + + FD_TEST( fd_lru_align() == FD_LRU_ALIGN ); + FD_TEST( !fd_lru_footprint( ULONG_MAX, 4UL ) ); + FD_TEST( !fd_lru_footprint( 1UL, ULONG_MAX ) ); + FD_TEST( fd_lru_map_cnt_default( 0UL ) == 0UL ); + FD_TEST( fd_lru_map_cnt_default( 1UL ) == 8UL ); + FD_TEST( fd_lru_map_cnt_default( 2UL ) == 8UL ); + FD_TEST( fd_lru_map_cnt_default( 3UL ) == 16UL ); + FD_TEST( fd_lru_map_cnt_default( 6UL ) == 16UL ); + FD_TEST( fd_lru_map_cnt_default( 7UL ) == 32UL ); + + ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); + if ( cpu_idx > fd_shmem_cpu_cnt() ) cpu_idx = 0UL; + + ulong page_cnt = 1; + char * _page_sz = "gigantic"; + ulong numa_idx = fd_shmem_numa_idx( 0 ); + FD_LOG_NOTICE( ( "Creating workspace (--page-cnt %lu, --page-sz %s, --numa-idx %lu)", + page_cnt, + _page_sz, + numa_idx ) ); + fd_wksp_t * wksp = fd_wksp_new_anonymous( + fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); + FD_TEST( wksp ); + + ulong depth = 1UL << 16; + ulong map_cnt = 0UL; + ulong align = fd_lru_align(); + ulong footprint = fd_lru_footprint( depth, map_cnt ); + if ( FD_UNLIKELY( !footprint ) ) FD_LOG_ERR( ( "bad depth / map_cnt" ) ); + FD_LOG_NOTICE( ( "Creating lru (--depth %lu, --map-cnt %lu, align %lu, footprint %lu)", + depth, + map_cnt, + align, + footprint ) ); + void * mem = fd_wksp_alloc_laddr( wksp, align, footprint, 1UL ); + FD_TEST( mem ); + void * _lru = fd_lru_new( mem, depth, map_cnt ); + FD_TEST( _lru ); + fd_lru_t * lru = fd_lru_join( _lru ); + FD_TEST( lru ); + + if ( !map_cnt ) { + map_cnt = fd_lru_map_cnt_default( depth ); + FD_LOG_NOTICE( ( "default map_cnt %lu used", map_cnt ) ); + } + FD_LOG_NOTICE( + ( "[LRU cache] depth: %lu, map_cnt: %lu, footprint: %lu", depth, map_cnt, footprint ) ); + + FD_TEST( fd_lru_depth( lru ) == depth ); + FD_TEST( fd_lru_free_top( lru ) == 1UL ); + FD_TEST( fd_lru_map_cnt( lru ) == map_cnt ); + FD_TEST( fd_lru_list_laddr( lru ) ); + FD_TEST( fd_lru_map_laddr( lru ) ); + + fd_list_t ** map = fd_lru_map_laddr( lru ); + for ( ulong tag = 1; tag <= depth; tag++ ) { + FD_TEST( fd_lru_free_top( lru ) == tag ); + fd_list_t * upsert = fd_lru_upsert( lru, tag ); + int found; + ulong map_idx; + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); + FD_TEST( found == ( upsert == NULL ) ); + FD_TEST( map[map_idx] ); + FD_TEST( map[map_idx]->tag == tag ); + FD_TEST( map[map_idx]->curr ); + FD_TEST( fd_lru_list_head( lru )->tag == 1 ); + } + for ( ulong tag = 1; tag <= depth; tag++ ) { + int found; + ulong map_idx; + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); + FD_TEST( map[map_idx] ); + FD_TEST( map[map_idx]->tag == tag ); + FD_TEST( map[map_idx]->curr ); + FD_TEST( fd_lru_list_head( lru )->tag == 1 ); + } + + for ( ulong tag = depth + 1; tag <= 2 * depth; tag++ ) { + int found; + ulong map_idx; + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( !found ); + fd_lru_upsert( lru, tag ); + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); 
+ FD_TEST( map[map_idx] ); + FD_TEST( map[map_idx]->tag == tag ); + FD_TEST( map[map_idx]->curr ); + FD_TEST( fd_lru_list_head( lru )->tag == tag - depth + 1 ); + } + + /* already present */ + do { + fd_lru_upsert( lru, depth + 1 ); + int found; + ulong map_idx; + ulong tag = depth + 1; + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); + FD_TEST( map[map_idx] ); + FD_TEST( map[map_idx]->tag == tag ); + FD_TEST( fd_lru_list_tail( lru )->tag == tag ); + FD_TEST( fd_lru_list_head( lru )->tag == depth + 2UL ); + } while ( 0 ); + + /* update every element */ + for ( ulong tag = depth + 1; tag <= 2 * depth; tag++ ) { + int found; + ulong map_idx; + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); + fd_lru_upsert( lru, tag ); + FD_LRU_QUERY( found, map_idx, map, map_cnt, tag ); + FD_TEST( found ); + FD_TEST( map[map_idx] ); + FD_TEST( map[map_idx]->tag == tag ); + FD_TEST( fd_lru_list_tail( lru )->tag == tag ); + } + + for ( ulong i = 1; i < depth; i++ ) { + ushort n = fd_rng_ushort( rng ); /* assumes depth = USHORT_MAX */ + int found; + ulong map_idx; + if ( n < ( fd_rng_uchar( rng ) >= ( 1 << 7 ) ) ) { + ulong tag = (ulong)n + depth + 1; + FD_LRU_QUERY( found, map_idx, map, lru->map_cnt, tag ); + fd_list_t * evicted = fd_lru_upsert( lru, tag ); + FD_TEST( found ); + FD_TEST( evicted == NULL ); + } else { + ulong tag = (ulong)n; + FD_LRU_QUERY( found, map_idx, map, lru->map_cnt, tag ); + (void)map_idx; + FD_TEST( !found ); + } + } + + FD_LOG_NOTICE( ( "Cleaning up" ) ); + + FD_TEST( fd_lru_leave( lru ) == _lru ); + FD_TEST( fd_lru_delete( _lru ) == mem ); + fd_wksp_free_laddr( mem ); + fd_wksp_delete_anonymous( wksp ); + + fd_rng_delete( fd_rng_leave( rng ) ); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} + +#else + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + // FD_LOG_WARNING(( "skip: unit test requires FD_HAS_HOSTED capabilities" )); + fd_halt(); + return 0; +} + +#endif diff --git a/src/tango/mvcc/Local.mk b/src/tango/mvcc/Local.mk new file mode 100644 index 0000000000..67e09fa73d --- /dev/null +++ b/src/tango/mvcc/Local.mk @@ -0,0 +1,4 @@ +$(call add-hdrs,fd_mvcc.h) +$(call add-objs,fd_mvcc,fd_tango) +$(call make-unit-test,test_mvcc,test_mvcc,fd_tango fd_util) +$(call run-unit-test,test_mvcc,) diff --git a/src/tango/mvcc/fd_mvcc.c b/src/tango/mvcc/fd_mvcc.c new file mode 100644 index 0000000000..d4190f10fc --- /dev/null +++ b/src/tango/mvcc/fd_mvcc.c @@ -0,0 +1,38 @@ +#include "../../util/fd_util.h" +#include "fd_mvcc.h" + +ulong * +fd_mvcc_version_laddr( fd_mvcc_t * mvcc ) { + return &mvcc->version; +} + +ulong +fd_mvcc_begin_write( fd_mvcc_t * mvcc ) { + ulong version = FD_ATOMIC_FETCH_AND_ADD( fd_mvcc_version_laddr( mvcc ), 1 ); + FD_COMPILER_MFENCE(); + return version; +} + +ulong +fd_mvcc_end_write( fd_mvcc_t * mvcc ) { + FD_COMPILER_MFENCE(); + return FD_ATOMIC_FETCH_AND_ADD( fd_mvcc_version_laddr( mvcc ), 1 ); +} + +ulong +fd_mvcc_read( fd_mvcc_t * mvcc ) { + FD_COMPILER_MFENCE(); + ulong version = FD_VOLATILE_CONST( mvcc->version ); + FD_COMPILER_MFENCE(); + return version; +} + +ulong +fd_mvcc_begin_read( fd_mvcc_t * mvcc ) { + return fd_mvcc_read( mvcc ); +} + +ulong +fd_mvcc_end_read( fd_mvcc_t * mvcc ) { + return fd_mvcc_read( mvcc ); +} diff --git a/src/tango/mvcc/fd_mvcc.h b/src/tango/mvcc/fd_mvcc.h new file mode 100644 index 0000000000..d885faba2e --- /dev/null +++ b/src/tango/mvcc/fd_mvcc.h @@ -0,0 +1,64 @@ +#ifndef HEADER_fd_src_tango_mvcc_fd_mvcc_h +#define HEADER_fd_src_tango_mvcc_fd_mvcc_h 
+
+#include "../../util/fd_util.h"
+
+/* fd_mvcc ("Multiversion Concurrency Control") is a simple primitive for lock-free synchronization
+   of concurrent readers and writers.  It is strictly less general than the MVCC used in various
+   DBMS [https://dl.acm.org/doi/pdf/10.1145/356842.356846], but it is conceptually similar in that
+   it uses a version number to detect conflicts.
+
+   Usage:
+   - Writer increments version number
+   - Writer does update
+   - Writer increments version number
+   - Therefore, if the version number is odd, a write is in progress.
+
+   - Reader reads version number
+   - Reader reads data
+   - Reader reads version number
+   - Therefore, if the version number has changed, the read is invalid.
+
+   fd_mvcc_begin_write()  // increment version, then fence
+   ... write ...
+   fd_mvcc_end_write()    // fence, then increment version
+
+   ulong begin = fd_mvcc_begin_read()  // fenced load
+   ulong end   = fd_mvcc_end_read()    // fenced load
+   if (end != begin) {
+     ... read is invalid ...
+   }
+
+   Note this is similar to how producers / consumers synchronize across mcache / dcache.
+
+   TODO hardware fencing */
+
+struct fd_mvcc {
+  ulong version;
+};
+typedef struct fd_mvcc fd_mvcc_t;
+
+/* fd_mvcc_version_laddr returns a local pointer to the version number for the current joined
+ * process.  Caller is responsible for fencing the dereference if necessary. */
+ulong *
+fd_mvcc_version_laddr( fd_mvcc_t * mvcc );
+
+/* fd_mvcc_begin_write atomically increments the version number and returns its pre-increment
+ * value, fencing subsequent memory accesses (the protected writes cannot be reordered before
+ * the increment). */
+ulong
+fd_mvcc_begin_write( fd_mvcc_t * mvcc );
+
+/* fd_mvcc_end_write atomically increments the version number and returns its pre-increment
+ * value, fencing preceding memory accesses (the protected writes cannot be reordered after
+ * the increment), akin to C++ memory_order_release. */
+ulong
+fd_mvcc_end_write( fd_mvcc_t * mvcc );
+
+/* fd_mvcc_{begin,end}_read are convenience wrappers for code readability that help the
+   caller remember to read the version back after reading the protected data.
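+
+   A typical reader retry loop (an illustrative sketch only; it retries while a
+   write is in progress, i.e. the version is odd, or the version changed mid-read)
+   might look like:
+
+     ulong v0, v1;
+     do {
+       v0 = fd_mvcc_begin_read( mvcc );
+       ... copy the protected data out ...
+       v1 = fd_mvcc_end_read( mvcc );
+     } while( (v0 & 1UL) || (v0 != v1) );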
*/ +ulong +fd_mvcc_begin_read( fd_mvcc_t * mvcc ); + +ulong +fd_mvcc_end_read( fd_mvcc_t * mvcc ); + +#endif /* HEADER_fd_src_tango_mvcc_fd_mvcc_h */ diff --git a/src/tango/mvcc/test_mvcc.c b/src/tango/mvcc/test_mvcc.c new file mode 100644 index 0000000000..1c0c625c80 --- /dev/null +++ b/src/tango/mvcc/test_mvcc.c @@ -0,0 +1,27 @@ +#include "../../util/fd_util.h" +#include "fd_mvcc.h" + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + if ( FD_UNLIKELY( argc > 1 ) ) FD_LOG_ERR( ( "unrecognized argument: %s", argv[1] ) ); + + fd_mvcc_t mvcc = { .version = 0 }; + FD_TEST( fd_mvcc_begin_read( &mvcc ) == 0 ); + FD_TEST( fd_mvcc_end_read( &mvcc ) == 0 ); + + FD_TEST( fd_mvcc_begin_write( &mvcc ) == 0 ); + FD_TEST( fd_mvcc_begin_read( &mvcc ) == 1 ); + FD_TEST( fd_mvcc_end_read( &mvcc ) == 1 ); + FD_TEST( fd_mvcc_end_write( &mvcc ) == 1 ); + + FD_TEST( fd_mvcc_begin_read( &mvcc ) == 2 ); + FD_TEST( fd_mvcc_begin_write( &mvcc ) == 2 ); + FD_TEST( fd_mvcc_end_read( &mvcc ) == 3 ); + FD_TEST( fd_mvcc_end_write( &mvcc ) == 3 ); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} diff --git a/src/tango/quic/Local.mk b/src/tango/quic/Local.mk index 14d3eecd8e..3896c54896 100644 --- a/src/tango/quic/Local.mk +++ b/src/tango/quic/Local.mk @@ -1,7 +1,7 @@ ifdef FD_HAS_OPENSSL $(call make-lib,fd_quic) $(call add-objs,fd_quic fd_quic_conn fd_quic_conn_id fd_quic_conn_map fd_quic_proto \ - fd_quic_stream tls/fd_quic_tls crypto/fd_quic_crypto_suites templ/fd_quic_transport_params \ - templ/fd_quic_parse_util fd_quic_pkt_meta,fd_quic) + fd_quic_stream_pool fd_quic_stream tls/fd_quic_tls crypto/fd_quic_crypto_suites \ + templ/fd_quic_transport_params templ/fd_quic_parse_util fd_quic_pkt_meta fd_quic_qos,fd_quic) $(call make-bin,fd_quic_ctl,fd_quic_ctl,fd_quic fd_ballet fd_util) endif diff --git a/src/tango/quic/fd_quic.c b/src/tango/quic/fd_quic.c index d605fb0eef..cf1c976d61 100644 --- a/src/tango/quic/fd_quic.c +++ b/src/tango/quic/fd_quic.c @@ -1,7 +1,9 @@ +#include "fd_quic_common.h" #include "fd_quic_private.h" #include "fd_quic_conn.h" #include "fd_quic_conn_map.h" #include "fd_quic_proto.h" +#include "fd_quic_stream_pool.h" #include "crypto/fd_quic_crypto_suites.h" #include "templ/fd_quic_transport_params.h" @@ -54,6 +56,7 @@ struct fd_quic_layout { ulong event_queue_off; /* offset of event queue mem region */ int lg_slot_cnt; /* see conn_map_new */ ulong tls_off; /* offset of fd_quic_tls_t */ + ulong stream_pool_off; /* offset of stream pool */ }; typedef struct fd_quic_layout fd_quic_layout_t; @@ -123,6 +126,22 @@ fd_quic_footprint_ext( fd_quic_limits_t const * limits, if( FD_UNLIKELY( !tls_footprint ) ) { FD_LOG_WARNING(( "invalid fd_quic_tls_footprint" )); return 0UL; } offs += tls_footprint; + /* adjust stream pool sz */ + ulong stream_pool_sz = limits->stream_pool_sz; + if( stream_pool_sz == 0UL ) { + /* zero reverts to original behavior */ + ulong tot_stream_cnt = 0UL; + for( ulong j = 0UL; j < 4UL; ++j ) tot_stream_cnt += limits->stream_cnt[j]; + stream_pool_sz = tot_stream_cnt * limits->conn_cnt; + } + + /* allocate space for fd_quic_stream_pool_t */ + offs = fd_ulong_align_up( offs, fd_quic_stream_pool_align() ); + layout->stream_pool_off = offs; + ulong stream_pool_footprint = fd_quic_stream_pool_footprint( stream_pool_sz, tx_buf_sz ); + if( FD_UNLIKELY( !stream_pool_footprint ) ) { FD_LOG_WARNING(( "invalid fd_quic_stream_pool_footprint" )); return 0UL; } + offs += stream_pool_footprint; + return offs; } @@ -140,6 +159,7 @@ fd_quic_new( void * mem, if( 
FD_UNLIKELY( !mem ) ) {
     FD_LOG_WARNING(( "NULL mem" ));
     return NULL;
   }
 
@@ -251,7 +271,6 @@ fd_quic_set_aio_net_tx( fd_quic_t * quic,
   }
 }
 
-/* initialize everything that mutates during runtime */
 static void
 fd_quic_stream_init( fd_quic_stream_t * stream ) {
@@ -372,6 +391,25 @@ fd_quic_init( fd_quic_t * quic ) {
   fd_quic_state_t * state = fd_quic_get_state( quic );
   memset( state, 0, sizeof(fd_quic_state_t) );
 
+  /* Create stream pool */
+
+  ulong  stream_pool_laddr = (ulong)quic + layout.stream_pool_off;
+  void * stream_pool_mem   = (void*)( stream_pool_laddr );
+  ulong  stream_pool_sz    = limits->stream_pool_sz;
+  if( stream_pool_sz == 0UL ) {
+    /* zero reverts to original behavior */
+    ulong tot_stream_cnt = 0UL;
+    for( ulong j = 0UL; j < 4UL; ++j ) tot_stream_cnt += limits->stream_cnt[j];
+    stream_pool_sz = tot_stream_cnt * limits->conn_cnt;
+  }
+  fd_quic_stream_pool_t * stream_pool = fd_quic_stream_pool_new( stream_pool_mem, stream_pool_sz, limits->tx_buf_sz );
+  if( FD_UNLIKELY( !stream_pool ) ) {
+    FD_LOG_ERR(( "Unable to create stream pool" ));
+    return NULL;
+  }
+
+  state->stream_pool = stream_pool;
+
   /* State: initialize each connection, and add to free list */
 
   ulong conn_laddr = (ulong)quic + layout.conns_off;
@@ -448,11 +486,11 @@ fd_quic_init( fd_quic_t * quic ) {
     .handshake_complete_cb = fd_quic_tls_cb_handshake_complete,
     .keylog_cb             = fd_quic_tls_cb_keylog,
 
+    .keylog_fd = keylog_fd,
+
     /* set up alpn */
     .alpns    = (uchar const *)config->alpns,
     .alpns_sz = config->alpns_sz,
-
-    .keylog_fd = keylog_fd
   };
 
   tls_cfg.cert     = (X509 *)    quic->cert_object;     quic->cert_object     = NULL;
   tls_cfg.cert_key = (EVP_PKEY *)quic->cert_key_object; quic->cert_key_object = NULL;
@@ -840,35 +878,36 @@ fd_quic_conn_new_stream( fd_quic_conn_t * conn,
   uint type = server + ( (uint)dirtype << 1u );
 
   ulong next_stream_id = conn->next_stream_id[type];
 
-  uint stream_cnt = (uint)(
-    conn->quic->limits.stream_cnt[ 0x00 ] +
-    conn->quic->limits.stream_cnt[ 0x01 ] +
-    conn->quic->limits.stream_cnt[ 0x02 ] +
-    conn->quic->limits.stream_cnt[ 0x03 ] );
-  uint cur_num_streams = (uint)conn->num_streams[type];
 
   /* have we maxed out our max stream id??
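+     (Illustrative arithmetic: a stream ID encodes its type in the two low bits, so for
+      type==0x03 with max_streams[0x03]==8 the largest permitted ID is (8UL<<2u)+3 = 35,
+      matching the max_stream_id computation below.)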
*/ ulong max_stream_id = ( conn->max_streams[type] << 2u ) + type; if( FD_UNLIKELY( ( next_stream_id > max_stream_id ) | - ( conn->state != FD_QUIC_CONN_STATE_ACTIVE ) | - ( cur_num_streams >= stream_cnt ) ) ) { + ( conn->state != FD_QUIC_CONN_STATE_ACTIVE ) ) ) { /* this is a normal condition which occurs whenever we run up to the peer advertized limit and represents one form of flow control */ return NULL; } - /* find unused stream */ - fd_quic_stream_t * stream = conn->unused_streams->next; + /* find unused stream of correct type */ + fd_quic_stream_t * stream_sentinel = conn->unused_streams[type]; + fd_quic_stream_t * stream = stream_sentinel->next; - /* should not occur - implies logic error */ - if( FD_UNLIKELY( stream == conn->unused_streams ) ) { - FD_LOG_ERR(( "max_concur_streams not reached, yet no free streams found" )); + /* no available streams */ + if( FD_UNLIKELY( stream == stream_sentinel ) ) { + /* could be that the peer tried using a stream without having credits */ + /* TODO test for credits, abort connection if appropriate */ + return NULL; } /* remove from unused list */ FD_QUIC_STREAM_LIST_REMOVE( stream ); + /* add to used list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, stream ); + + /* set list membership */ + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_USED; + fd_quic_stream_init( stream ); /* 0x00 Client-Initiated, Bidirectional @@ -972,8 +1011,11 @@ fd_quic_stream_send( fd_quic_stream_t * stream, } /* insert into send list */ - if( stream->flags == 0 ) { + if( stream->list_memb != FD_QUIC_STREAM_LIST_MEMB_SEND ) { + FD_QUIC_STREAM_LIST_REMOVE( stream ); FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->send_streams, stream ); + + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_SEND; } stream->flags |= FD_QUIC_STREAM_FLAGS_UNSENT; /* we have unsent data */ stream->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; /* schedule tx */ @@ -1005,8 +1047,11 @@ fd_quic_stream_fin( fd_quic_stream_t * stream ) { fd_quic_conn_t * conn = stream->conn; /* insert into send list */ - if( stream->flags == 0 ) { + if( stream->list_memb != FD_QUIC_STREAM_LIST_MEMB_SEND ) { + FD_QUIC_STREAM_LIST_REMOVE( stream ); FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->send_streams, stream ); + + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_SEND; } stream->flags |= FD_QUIC_STREAM_FLAGS_TX_FIN; /* state immediately updated */ stream->state |= FD_QUIC_STREAM_STATE_TX_FIN; /* state immediately updated */ @@ -1019,6 +1064,22 @@ fd_quic_stream_fin( fd_quic_stream_t * stream ) { /* TODO update metrics */ } +void +fd_quic_stream_close( fd_quic_stream_t * stream ) { + /* TODO implement STOP_SENDING + + If the stream is in the "Recv" or "Size Known" state, the transport SHOULD + signal this by sending a STOP_SENDING frame to prompt closure of the stream + in the opposite direction. */ + fd_quic_stream_free( stream->conn->quic, stream->conn, stream, FD_QUIC_NOTIFY_ABORT ); +} + +void +fd_quic_conn_set_max_streams( fd_quic_conn_t * conn, int dirtype, ulong max_streams ) { + int type = ((dirtype & 1) << 1) + !conn->server; /* `dirtype & 1` clamps to 0 or 1 */ + conn->max_streams[(ulong)type] = max_streams; +} + void fd_quic_conn_set_rx_max_data( fd_quic_conn_t * conn, ulong rx_max_data ) { conn->rx_max_data = rx_max_data; @@ -1209,8 +1270,13 @@ fd_quic_handle_v1_initial( fd_quic_t * quic, } /* Early check: Is conn free? */ - - if( !state->conns ) { + if ( FD_UNLIKELY( !state->conns ) ) { + /* attempt to service any pending closes to free a conn. 
for example, an application may + have evicted an old conn as part of cb.conn_new */ + FD_DEBUG( FD_LOG_DEBUG( ( "fd_quic_conn_create: no free conn slots so calling service" ) ) ); + fd_quic_service( quic ); + } + if ( FD_UNLIKELY( !state->conns ) ) { FD_DEBUG( FD_LOG_DEBUG(( "ignoring conn request: no free conn slots" )) ); quic->metrics.conn_err_no_slots_cnt++; return FD_QUIC_PARSE_FAIL; /* FIXME better error code? */ @@ -2912,8 +2978,9 @@ fd_quic_tls_cb_handshake_complete( fd_quic_tls_hs_t * hs, conn->tx_initial_max_stream_data_bidi_local = peer_tp->initial_max_stream_data_bidi_local; conn->tx_initial_max_stream_data_bidi_remote = peer_tp->initial_max_stream_data_bidi_remote; - fd_quic_state_t * state = fd_quic_get_state( conn->quic ); + fd_quic_state_t * state = fd_quic_get_state( conn->quic ); fd_quic_transport_params_t * our_tp = &state->transport_params; + conn->rx_max_data = our_tp->initial_max_data; conn->rx_initial_max_stream_data_uni = our_tp->initial_max_stream_data_uni; conn->rx_initial_max_stream_data_bidi_local = our_tp->initial_max_stream_data_bidi_local; @@ -2931,21 +2998,17 @@ fd_quic_tls_cb_handshake_complete( fd_quic_tls_hs_t * hs, /* max streams set the initial max allowed by the peer */ - uint stream_cnt = (uint)( - conn->quic->limits.stream_cnt[ 0x00 ] + - conn->quic->limits.stream_cnt[ 0x01 ] + - conn->quic->limits.stream_cnt[ 0x02 ] + - conn->quic->limits.stream_cnt[ 0x03 ] ); + ulong * stream_cnts = conn->quic->limits.stream_cnt; if( conn->server ) { /* 0x01 server-initiated, bidirectional */ - conn->max_streams[0x01] = fd_uint_min( stream_cnt, (uint)peer_tp->initial_max_streams_bidi ); + conn->max_streams[0x01] = fd_uint_min( (uint)stream_cnts[0x01], (uint)peer_tp->initial_max_streams_bidi ); /* 0x03 server-initiated, unidirectional */ - conn->max_streams[0x03] = fd_uint_min( stream_cnt, (uint)peer_tp->initial_max_streams_uni ); + conn->max_streams[0x03] = fd_uint_min( (uint)stream_cnts[0x03], (uint)peer_tp->initial_max_streams_uni ); } else { /* 0x00 client-initiated, bidirectional */ - conn->max_streams[0x00] = fd_uint_min( stream_cnt, (uint)peer_tp->initial_max_streams_bidi ); + conn->max_streams[0x00] = fd_uint_min( (uint)stream_cnts[0x00], (uint)peer_tp->initial_max_streams_bidi ); /* 0x02 client-initiated, unidirectional */ - conn->max_streams[0x02] = fd_uint_min( stream_cnt, (uint)peer_tp->initial_max_streams_uni ); + conn->max_streams[0x02] = fd_uint_min( (uint)stream_cnts[0x02], (uint)peer_tp->initial_max_streams_uni ); } return; @@ -3931,6 +3994,7 @@ fd_quic_conn_tx( fd_quic_t * quic, fd_quic_conn_t * conn ) { ulong stream_type_idx = 2u | !conn->server; frame.max_streams.stream_type = 1; frame.max_streams.max_streams = conn->max_streams[stream_type_idx]; + frame.max_streams.max_streams = conn->max_streams[stream_type_idx]; /* attempt to write into buffer */ frame_sz = fd_quic_encode_max_streams_frame( payload_ptr, @@ -4054,6 +4118,10 @@ fd_quic_conn_tx( fd_quic_t * quic, fd_quic_conn_t * conn ) { if( cur_stream->flags == 0 ) { /* remove cur_stream from action list */ FD_QUIC_STREAM_LIST_REMOVE( cur_stream ); + + /* add to used list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, cur_stream ); + cur_stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_USED; } } } @@ -4322,6 +4390,10 @@ fd_quic_conn_tx( fd_quic_t * quic, fd_quic_conn_t * conn ) { if( stream->flags == 0 ) { /* remove from list */ FD_QUIC_STREAM_LIST_REMOVE( stream ); + + /* add to used list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, stream ); + stream->list_memb = 
FD_QUIC_STREAM_LIST_MEMB_USED; } } } @@ -4402,6 +4474,7 @@ fd_quic_conn_service( fd_quic_t * quic, fd_quic_conn_t * conn, ulong now ) { conn->state = FD_QUIC_CONN_STATE_ACTIVE; /* user callback */ + FD_LOG_NOTICE(("conn acquire %p", (void *)conn)); fd_quic_cb_conn_new( quic, conn ); } } @@ -4489,21 +4562,41 @@ fd_quic_conn_free( fd_quic_t * quic, } } - /* remove all stream ids from map, and free stream */ - ulong tot_num_streams = conn->tot_num_streams; - for( ulong j = 0; j < tot_num_streams; ++j ) { - fd_quic_stream_t * stream = conn->streams[j]; - if( stream->stream_id != FD_QUIC_STREAM_ID_UNUSED ) { - fd_quic_stream_map_t * stream_entry = fd_quic_stream_map_query( conn->stream_map, stream->stream_id, NULL ); - if( stream_entry ) { - /* fd_quic_stream_free calls fd_quic_stream_map_remove */ - /* TODO we seem to be freeing more streams than expected here */ - if( stream_entry->stream ) { - fd_quic_stream_free( quic, conn, stream_entry->stream, FD_QUIC_NOTIFY_ABORT ); - } else { - fd_quic_stream_map_remove( conn->stream_map, stream_entry ); - } - } + /* remove used streams */ + fd_quic_stream_t * used_sentinel = conn->used_streams; + while( 1 ) { + fd_quic_stream_t * stream = used_sentinel->next; + + if( stream == used_sentinel ) break; + + fd_quic_stream_free( quic, conn, stream, FD_QUIC_NOTIFY_ABORT ); + } + + /* remove send streams */ + fd_quic_stream_t * send_sentinel = conn->send_streams; + while( 1 ) { + fd_quic_stream_t * stream = send_sentinel->next; + + if( stream == send_sentinel ) break; + + fd_quic_stream_free( quic, conn, stream, FD_QUIC_NOTIFY_ABORT ); + } + + /* deallocate unused streams */ + for( ulong j = 0UL; j < 4UL; ++j ) { + fd_quic_stream_t * unused_sentinel = conn->unused_streams[j]; + while( 1 ) { + fd_quic_stream_t * stream = unused_sentinel->next; + + if( stream == unused_sentinel ) break; + + /* remove from list */ + FD_QUIC_STREAM_LIST_REMOVE( stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_NONE; + stream->flags = 0u; + + /* return to pool */ + fd_quic_stream_pool_free( state->stream_pool, stream ); } } @@ -4524,6 +4617,7 @@ fd_quic_conn_free( fd_quic_t * quic, } /* put connection back in free list */ + FD_LOG_NOTICE(("conn release %p", (void *)conn)); conn->next = state->conns; state->conns = conn; conn->state = FD_QUIC_CONN_STATE_INVALID; @@ -4641,8 +4735,7 @@ fd_quic_connect( fd_quic_t * quic, transport_params_raw, FD_QUIC_TRANSPORT_PARAMS_RAW_SZ, tp ); - if( FD_UNLIKELY( tp_rc == FD_QUIC_ENCODE_FAIL ) ) { - /* FIXME log error in counters */ + if( FD_UNLIKELY( tp_rc == FD_QUIC_ENCODE_FAIL ) ) { /* FIXME log error in counters */ FD_DEBUG( FD_LOG_DEBUG(( "fd_quic_encode_transport_params failed" )) ); goto fail_conn; } @@ -4726,8 +4819,9 @@ fd_quic_conn_create( fd_quic_t * quic, /* fetch top of connection free list */ fd_quic_conn_t * conn = state->conns; - if( FD_UNLIKELY( !conn ) ) { - FD_DEBUG( FD_LOG_DEBUG(( "fd_quic_conn_create failed: no free conn slots" )) ); + + if ( FD_UNLIKELY( !conn ) ) { /* should have been caught by the early check in `handle_initial` */ + FD_DEBUG( FD_LOG_DEBUG(( "fd_quic_conn_create failed: no free conn slots and failed to evict" )) ); quic->metrics.conn_err_no_slots_cnt++; return NULL; } @@ -4789,26 +4883,6 @@ fd_quic_conn_create( fd_quic_t * quic, conn->handshake_done_send = 0; conn->tls_hs = NULL; /* created later */ - /* initial max_streams */ - - if( server ) { - /* we are the server, so start client-initiated at our max-concurrent, - and server-initiated at 0 peer will advertise its configured maximum */ - 
conn->max_streams[ 0x00 ] = quic->limits.stream_cnt[ 0x00 ]; /* 0x00 Client-Initiated, Bidirectional */ - conn->max_streams[ 0x01 ] = 0; /* 0x01 Server-Initiated, Bidirectional */ - conn->max_streams[ 0x02 ] = quic->limits.stream_cnt[ 0x02 ]; /* 0x02 Client-Initiated, Unidirectional */ - conn->max_streams[ 0x03 ] = 0; /* 0x03 Server-Initiated, Unidirectional */ - } else { - /* we are the client, so start server-initiated at our max-concurrent, - and client-initiated at 0 peer will advertise its configured maximum */ - conn->max_streams[ 0x00 ] = 0; /* 0x00 Client-Initiated, Bidirectional */ - conn->max_streams[ 0x01 ] = quic->limits.stream_cnt[ 0x01 ]; /* 0x01 Server-Initiated, Bidirectional */ - conn->max_streams[ 0x02 ] = 0; /* 0x02 Client-Initiated, Unidirectional */ - conn->max_streams[ 0x03 ] = quic->limits.stream_cnt[ 0x03 ]; /* 0x03 Server-Initiated, Unidirectional */ - } - - /* conn->streams initialized inside fd_quic_conn_new */ - /* points to free tx space */ conn->tx_ptr = conn->tx_buf; conn->tx_sz = sizeof( conn->tx_buf ); @@ -4826,24 +4900,58 @@ fd_quic_conn_create( fd_quic_t * quic, conn->next_stream_id[2] = 2; conn->next_stream_id[3] = 3; - /* start at our max, peer is allowed to lower */ - conn->max_concur_streams = (uint)( - quic->limits.stream_cnt[ 0 ] + - quic->limits.stream_cnt[ 1 ] + - quic->limits.stream_cnt[ 2 ] + - quic->limits.stream_cnt[ 3 ] ); - /* array: current number of streams by type is zero */ fd_memset( &conn->num_streams, 0, sizeof( conn->num_streams ) ); /* initialize streams */ - FD_QUIC_STREAM_LIST_SENTINEL( conn->unused_streams ); + fd_quic_stream_pool_t * stream_pool = state->stream_pool; FD_QUIC_STREAM_LIST_SENTINEL( conn->send_streams ); - ulong tot_num_streams = conn->tot_num_streams; - for( ulong j = 0; j < tot_num_streams; ++j ) { - /* insert into unused list */ - FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->unused_streams, conn->streams[j] ); - conn->streams[j]->flags = 0; + FD_QUIC_STREAM_LIST_SENTINEL( conn->used_streams ); + for( ulong type = 0UL; type < 4UL; ++type ) { + fd_quic_stream_t * unused_streams = conn->unused_streams[type]; + + FD_QUIC_STREAM_LIST_SENTINEL( unused_streams ); + + ulong num_streams = quic->limits.stream_cnt[type]; + conn->tgt_max_streams[type] = num_streams; + for( ulong j = 0; j < num_streams; ++j ) { + /* allocate stream from pool */ + fd_quic_stream_t * stream = fd_quic_stream_pool_alloc( stream_pool ); + + if( !stream ) { + FD_LOG_WARNING(( "fd_quic_conn_create: no stream" )); + /* pool shouldn't run out here, but if it does, simply break */ + break; + } + + /* insert into unused list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( unused_streams, stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_UNUSED; + + /* increment alloc_streams */ + conn->alloc_streams[type]++; + conn->num_streams[type]++; + + stream->flags = 0; + } + } + + /* initial max_streams */ + + if( server ) { + /* we are the server, so start client-initiated at our max-concurrent, + and server-initiated at 0 peer will advertise its configured maximum */ + conn->max_streams[ 0x00 ] = conn->alloc_streams[ 0x00 ]; /* 0x00 Client-Initiated, Bidirectional */ + conn->max_streams[ 0x01 ] = 0; /* 0x01 Server-Initiated, Bidirectional */ + conn->max_streams[ 0x02 ] = conn->alloc_streams[ 0x02 ]; /* 0x02 Client-Initiated, Unidirectional */ + conn->max_streams[ 0x03 ] = 0; /* 0x03 Server-Initiated, Unidirectional */ + } else { + /* we are the client, so start server-initiated at our max-concurrent, + and client-initiated at 0 peer will advertise its configured 
maximum */ + conn->max_streams[ 0x00 ] = 0; /* 0x00 Client-Initiated, Bidirectional */ + conn->max_streams[ 0x01 ] = conn->alloc_streams[ 0x01 ]; /* 0x01 Server-Initiated, Bidirectional */ + conn->max_streams[ 0x02 ] = 0; /* 0x02 Client-Initiated, Unidirectional */ + conn->max_streams[ 0x03 ] = conn->alloc_streams[ 0x03 ]; /* 0x03 Server-Initiated, Unidirectional */ } /* initialize packet metadata */ @@ -5099,10 +5207,12 @@ fd_quic_pkt_meta_retry( fd_quic_t * quic, /* move tx_sent back to calculated offset */ stream->tx_sent = offset; - /* if flags==0, the stream is not in the send list */ - if( stream->flags == 0 ) { - /* insert into send list */ + /* insert into send list */ + if( stream->list_memb != FD_QUIC_STREAM_LIST_MEMB_SEND ) { + FD_QUIC_STREAM_LIST_REMOVE( stream ); FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->send_streams, stream ); + + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_SEND; } /* set the data to go out on the next packet */ @@ -5129,22 +5239,36 @@ fd_quic_pkt_meta_retry( fd_quic_t * quic, This will be fixed by reorg of pkt_meta */ - ulong tot_num_streams = conn->tot_num_streams; - for( ulong j = 0u; j < tot_num_streams; ++j ) { - fd_quic_stream_t * stream = conn->streams[j]; - - /* was this stream sent on the given packet number */ - if( stream->stream_id != FD_QUIC_STREAM_ID_UNUSED && - stream->upd_pkt_number == pkt_number ) { - /* if flags==0, the stream is not in the send list */ - if( stream->flags == 0 ) { - /* insert */ + /* scan used streams */ + fd_quic_stream_t * stream = conn->used_streams->next; + while( !stream->sentinel ) { + if( stream->upd_pkt_number == pkt_number ) { + /* insert into send list */ + if( stream->list_memb != FD_QUIC_STREAM_LIST_MEMB_SEND ) { + FD_QUIC_STREAM_LIST_REMOVE( stream ); FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->send_streams, stream ); + + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_SEND; } stream->flags |= FD_QUIC_STREAM_FLAGS_MAX_STREAM_DATA; stream->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; } + + stream = stream->next; + } + + /* scan send streams */ + stream = conn->send_streams->next; + while( !stream->sentinel ) { + if( stream->upd_pkt_number == pkt_number ) { + /* already in send list */ + + stream->flags |= FD_QUIC_STREAM_FLAGS_MAX_STREAM_DATA; + stream->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; + } + + stream = stream->next; } } if( flags & FD_QUIC_PKT_META_FLAGS_MAX_STREAMS_UNIDIR ) { @@ -5301,6 +5425,10 @@ fd_quic_reclaim_pkt_meta( fd_quic_conn_t * conn, if( stream->flags == 0 ) { /* remove from list */ FD_QUIC_STREAM_LIST_REMOVE( stream ); + + /* add to used list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_USED; } } } @@ -5402,21 +5530,6 @@ fd_quic_reclaim_pkt_meta( fd_quic_conn_t * conn, /* max_stream_data */ if( flags & FD_QUIC_PKT_META_FLAGS_MAX_STREAM_DATA ) { -#if 0 - ulong tot_num_streams = conn->tot_num_streams; - fd_quic_stream_t ** streams = conn->streams; - /* TODO avoid linear search here */ - for( ulong j = 0; j < tot_num_streams; ++j ) { - fd_quic_stream_t * stream = streams[j]; - if( stream->upd_pkt_number == pkt_number ) { - stream->flags &= ~FD_QUIC_STREAM_FLAGS_MAX_STREAM_DATA; - if( stream->flags == 0 ) { - /* stream must be in send_streams, so remove */ - FD_QUIC_STREAM_LIST_REMOVE( stream ); - } - } - } -#else fd_quic_stream_t * sentinel = conn->send_streams; fd_quic_stream_t * stream = sentinel->next; while( !stream->sentinel ) { @@ -5426,13 +5539,16 @@ fd_quic_reclaim_pkt_meta( fd_quic_conn_t * conn, if( stream->flags == 0 ) 
{ /* stream must be in send_streams, so remove */ FD_QUIC_STREAM_LIST_REMOVE( stream ); + + /* add to used streams list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_USED; } } } stream = stream->next; } -#endif } /* acks */ @@ -5661,6 +5777,9 @@ fd_quic_stream_free( fd_quic_t * quic, fd_quic_conn_t * conn, fd_quic_stream_t * /* TODO rename FD_QUIC_NOTIFY_END to FD_QUIC_STREAM_NOTIFY_END et al */ fd_quic_cb_stream_notify( quic, stream, stream->context, code ); + fd_quic_state_t * state = fd_quic_get_state( quic ); + + /* save the original stream id */ ulong stream_id = stream->stream_id; /* free the stream */ @@ -5672,31 +5791,61 @@ fd_quic_stream_free( fd_quic_t * quic, fd_quic_conn_t * conn, fd_quic_stream_t * fd_quic_stream_map_remove( conn->stream_map, stream_entry ); } + /* remove from list */ + FD_QUIC_STREAM_LIST_REMOVE( stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_NONE; + stream->flags = 0; + /* if stream of relevant type, increase max_streams for relevant type */ - ulong stream_type = stream_id & 3u; + ulong stream_type = stream_id & 3u; + int peer_initiated = (uint)( stream_type & 1u ) == (uint)!conn->server; + + /* track current number of streams */ + conn->num_streams[stream_type]--; + + /* we might want to increase max_streams and notify the peer */ + if( !peer_initiated || conn->alloc_streams[stream_type] < conn->tgt_max_streams[stream_type] ) { + ulong num_alloc = 1UL; + + if( peer_initiated ) { + /* trigger frame to increase max_streams for peer */ + uint flag = ( stream_id & 2u ) ? FD_QUIC_CONN_FLAGS_MAX_STREAMS_UNIDIR + : FD_QUIC_CONN_FLAGS_MAX_STREAMS_BIDIR; + conn->flags |= flag; + conn->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; + + num_alloc = conn->tgt_max_streams[stream_type] - conn->alloc_streams[stream_type]; + } + + /* return to the appropriate unused list */ + FD_QUIC_STREAM_LIST_INSERT_AFTER( conn->unused_streams[stream_type], stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_UNUSED; - /* was the stream initiated by the peer */ - if( (uint)( stream_type & 1u ) == (uint)!conn->server ) { conn->max_streams[stream_type]++; /* allows for one more stream */ - /* trigger frame to increase max_streams for peer */ - uint flag = ( stream_id & 2u ) ? FD_QUIC_CONN_FLAGS_MAX_STREAMS_UNIDIR - : FD_QUIC_CONN_FLAGS_MAX_STREAMS_BIDIR; - conn->flags |= flag; - conn->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; - } + num_alloc--; - /* remove from send_streams */ - if( stream->flags ) { - FD_QUIC_STREAM_LIST_REMOVE( stream ); - } - stream->flags = 0; + /* if we're still short of target, try allocating more */ + for( ulong j = 0UL; j < num_alloc; ++j ) { + fd_quic_stream_t * new_stream = fd_quic_stream_pool_alloc( state->stream_pool ); + if( !new_stream ) break; - /* insert into unused list */ - FD_QUIC_STREAM_LIST_INSERT_AFTER( conn->unused_streams, stream ); + //TODO( "does this stream need initializing here?" 
); - /* track current number of streams */ - conn->num_streams[stream_type]--; + /* add to the appropriate unused list */ + FD_QUIC_STREAM_LIST_INSERT_AFTER( conn->unused_streams[stream_type], new_stream ); + new_stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_UNUSED; + + conn->max_streams[stream_type]++; /* allows for one more stream */ + } + } else { + /* if we're peer initiated and we don't require the stream anymore + then return it to the pool */ + fd_quic_stream_pool_free( state->stream_pool, stream ); + + /* track allocated streams */ + conn->alloc_streams[stream_type]--; + } } static ulong @@ -5725,11 +5874,12 @@ fd_quic_frame_handle_stream_frame( fd_quic_stream_t * stream = NULL; fd_quic_stream_map_t * stream_entry = fd_quic_stream_map_query( context.conn->stream_map, stream_id, NULL ); + /* existing stream? */ if( stream_entry ) { stream = stream_entry->stream; } else { /* not found, get unused stream */ - fd_quic_stream_t * sentinel = context.conn->unused_streams; + fd_quic_stream_t * sentinel = context.conn->unused_streams[type]; stream = sentinel->next; @@ -5814,6 +5964,10 @@ fd_quic_frame_handle_stream_frame( fd_quic_conn_t * conn = context.conn; FD_QUIC_STREAM_LIST_REMOVE( stream ); + /* add to used list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->used_streams, stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_USED; + stream->stream_id = stream_id; /* track current number of streams */ @@ -5902,9 +6056,12 @@ fd_quic_frame_handle_stream_frame( /* set max_data and max_data_frame to go out next packet */ stream->upd_pkt_number = FD_QUIC_PKT_NUM_PENDING; - if( stream->flags == 0 ) { - /* going from 0 to nonzero, so insert into action list */ + /* insert into send list */ + if( stream->list_memb != FD_QUIC_STREAM_LIST_MEMB_SEND ) { + FD_QUIC_STREAM_LIST_REMOVE( stream ); FD_QUIC_STREAM_LIST_INSERT_BEFORE( conn->send_streams, stream ); + + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_SEND; } stream->flags |= FD_QUIC_STREAM_FLAGS_MAX_STREAM_DATA; diff --git a/src/tango/quic/fd_quic.h b/src/tango/quic/fd_quic.h index 1ffcb30580..dd6042707e 100644 --- a/src/tango/quic/fd_quic.h +++ b/src/tango/quic/fd_quic.h @@ -82,89 +82,15 @@ /* TODO provide fd_quic on non-hosted targets */ +#include "fd_quic_enum.h" + #include "../aio/fd_aio.h" #include "../../util/fd_util.h" +#include /* FD_QUIC_API marks public API declarations. No-op for now. */ #define FD_QUIC_API -/* FD_QUIC_{SUCCESS,FAILED} are used for error return codes. */ -#define FD_QUIC_SUCCESS (0) -#define FD_QUIC_FAILED (1) - -/* FD_QUIC_TYPE_{UNI,BI}DIR indicate stream type. */ -#define FD_QUIC_TYPE_BIDIR (0) -#define FD_QUIC_TYPE_UNIDIR (1) - -/* FD_QUIC_ALIGN specifies the alignment needed for an fd_quic_t. - This is provided to facilitate compile-time QUIC declarations. - Also see fd_quic_align() */ -#define FD_QUIC_ALIGN (4096UL) /* 4KiB */ - -/* FD_QUIC_MTU is the assumed network link MTU in bytes, including L2 - and L3 headers. */ -#define FD_QUIC_MTU (1500) - -/* FD_QUIC_INITIAL_PAYLOAD_SZ_MIN is the min byte size of the UDP payload - of Initial-type packets. Mandated for both clients and servers as a - form of MTU discovery and to mitigate amplification attacks. See - RFC 9000 Section 14.1: - https://datatracker.ietf.org/doc/html/rfc9000#name-initial-datagram-size */ -#define FD_QUIC_INITIAL_PAYLOAD_SZ_MIN (1200) -#define FD_QUIC_INITIAL_PAYLOAD_SZ_MAX (FD_QUIC_INITIAL_PAYLOAD_SZ_MIN) - -/* Tokens (both RETRY and NEW_TOKEN) are specified by varints. We bound it to - 77 bytes. 
Both our and quinn's RETRY tokens are 77 bytes, but our client - needs to be able to handle other server impl's of RETRY too. - - FIXME change this bound (requires variable-length encoding). */ -#define FD_QUIC_TOKEN_SZ_MAX (77) -/* Retry packets don't carry a token length field, so we infer it from the - footprint of a packet with a zero-length token and zero-length conn ids. */ -#define FD_QUIC_EMPTY_RETRY_PKT_SZ (23) - -/* FD_QUIC_MAX_PAYLOAD_SZ is the max byte size of the UDP payload of any - QUIC packets. Derived from FD_QUIC_MTU by subtracting the typical - IPv4 header (no options) and UDP header sizes. */ -#define FD_QUIC_MAX_PAYLOAD_SZ (FD_QUIC_MTU - 20 - 8) - -/* FD_QUIC_ROLE_{CLIENT,SERVER} identify the fd_quic_t's role as a - client or server. */ -#define FD_QUIC_ROLE_CLIENT 1 -#define FD_QUIC_ROLE_SERVER 2 - -/* FD_QUIC_SEND_ERR_* are negative int error codes indicating a stream - send failure. - ...INVAL_STREAM: Not allowed to send for stream ID (e.g. not open) - ...INVAL_CONN: Connection not in valid state for sending - ...FIN: Not allowed to send, stream finished */ -#define FD_QUIC_SEND_ERR_INVAL_STREAM (-1) -#define FD_QUIC_SEND_ERR_INVAL_CONN (-2) -#define FD_QUIC_SEND_ERR_STREAM_FIN (-3) - -/* FD_QUIC_MIN_CONN_ID_CNT: min permitted conn ID count per conn */ -#define FD_QUIC_MIN_CONN_ID_CNT (4UL) - -/* FD_QUIC_DEFAULT_SPARSITY: default fd_quic_limits_t->conn_id_sparsity */ -#define FD_QUIC_DEFAULT_SPARSITY (2.5) - -/* FD_QUIC_STREAM_TYPE_* indicate stream type (two least significant - bits of a stream ID) */ -#define FD_QUIC_STREAM_TYPE_BIDI_CLIENT 0 -#define FD_QUIC_STREAM_TYPE_BIDI_SERVER 1 -#define FD_QUIC_STREAM_TYPE_UNI_CLIENT 2 -#define FD_QUIC_STREAM_TYPE_UNI_SERVER 3 - -/* FD_QUIC_NOTIFY_* indicate stream notification types. - ...END: Stream lifetime has ended, no more callbacks will be - generated for it. Stream will be freed after event - delivery. - ...RESET: Peer has reset the stream (will not send) - ...ABORT: Peer has aborted the stream (will not receive) */ -#define FD_QUIC_NOTIFY_END (100) -#define FD_QUIC_NOTIFY_RESET (101) -#define FD_QUIC_NOTIFY_ABORT (102) - /* Forward declarations */ struct fd_quic_conn; @@ -187,13 +113,15 @@ struct __attribute__((aligned(16UL))) fd_quic_limits { ulong conn_id_cnt; /* per-conn, max conn ID count (min 4UL) */ double conn_id_sparsity; /* per-conn, conn ID hashmap sparsity */ - ulong stream_cnt[4]; /* per-conn, max concurrent stream count */ + ulong stream_cnt[4]; /* per-conn, initial max concurrent stream count */ double stream_sparsity; /* per-conn, stream hashmap sparsity */ ulong inflight_pkt_cnt; /* per-conn, max inflight packet count */ ulong tx_buf_sz; /* per-stream, tx buf sz in bytes */ /* the user consumes rx directly from the network buffer */ + + ulong stream_pool_sz; /* instance-wide, number of streams in stream pool */ }; typedef struct fd_quic_limits fd_quic_limits_t; @@ -251,6 +179,11 @@ struct __attribute__((aligned(16UL))) fd_quic_config { ulong initial_rx_max_stream_data; /* per-stream, rx buf sz in bytes, set by the user. */ + int verify_peer; /* sets SSL_VERIFY_PEER flag. if server, sends a client cert request. */ + int verify_depth; /* sets the maximum allowable depth of a cert chain when verifying. */ + int verify_strict; /* sets whether to fail the handshake if cert verification fails. 
*/ + int verify_self_signed; /* sets whether to allow self-signed certs */ + /* Network config ****************************************/ struct { /* Link layer config */ @@ -395,14 +328,14 @@ struct fd_quic_metrics { ulong net_tx_byte_cnt; /* total bytes sent */ /* Conn metrics */ - long conn_active_cnt; /* number of active conns */ - ulong conn_created_cnt; /* number of conns created */ - ulong conn_closed_cnt; /* number of conns gracefully closed */ - ulong conn_aborted_cnt; /* number of conns aborted */ - ulong conn_retry_cnt; /* number of conns established with retry */ - ulong conn_err_no_slots_cnt; /* number of conns that failed to create due to lack of slots */ - ulong conn_err_tls_fail_cnt; /* number of conns that aborted due to TLS failure */ - ulong conn_err_retry_fail_cnt; /* number of conns that failed during retry (e.g. invalid token) */ + long conn_active_cnt; /* number of active conns */ + ulong conn_created_cnt; /* number of conns created */ + ulong conn_closed_cnt; /* number of conns gracefully closed */ + ulong conn_aborted_cnt; /* number of conns aborted */ + ulong conn_retry_cnt; /* number of conns established with retry */ + ulong conn_err_no_slots_cnt; /* number of conns that failed to create due to lack of slots */ + ulong conn_err_tls_fail_cnt; /* number of conns that aborted due to TLS failure */ + ulong conn_err_retry_fail_cnt; /* number of conns that failed during retry (e.g. invalid token) */ /* Handshake metrics */ ulong hs_created_cnt; /* number of handshake flows created */ @@ -629,6 +562,8 @@ fd_quic_conn_new_stream( fd_quic_conn_t * conn, Use fd_quic_conn_new_stream to create a new stream for sending or use the new stream callback to obtain a stream for replying. + "On the sending part of a stream, an application protocol can write data..." + args stream the stream to send on batch a pointer to an array of buffers @@ -653,14 +588,48 @@ fd_quic_stream_send( fd_quic_stream_t * stream, no more data will be sent to self-to-peer flow of stream. Peer may continue sending data on their side of the stream. Caller should only call stream_fin once per stream, except when fin was already - indicated via stream_send. */ + indicated via stream_send. + "On the sending part of a stream, an application protocol can end the stream + (clean termination)..." */ FD_QUIC_API void fd_quic_stream_fin( fd_quic_stream_t * stream ); -/* TODO: fd_quic_stream_close */ -//void -//fd_quic_stream_close( fd_quic_stream_t * stream, int direction_flags ); +/* fd_quic_stream_close: close a stream. Called to signal no more data + will be read in the peer-to-self flow of stream and request stream closure. + + "On the receiving part of a stream, an application protocol can abort + reading of the stream and request closure..." */ +FD_QUIC_API void +fd_quic_stream_close( fd_quic_stream_t * stream ); + +/* Flow Control API ***************************************************/ + +/* fd_quic_conn_set_rx_max_data sets the maximum amount of data that can be sent + by the peer on a connection. This update will propagate to the peer via a + MAX_DATA frame. + + A violation of this flow control param will result in connection termination + with FLOW_CONTROL_ERROR, per RFC 9000. */ +FD_QUIC_API void +fd_quic_conn_set_rx_max_data( fd_quic_conn_t * conn, ulong rx_max_data ); + +FD_QUIC_API void +fd_quic_conn_set_max_streams( fd_quic_conn_t * conn, int dirtype, ulong max_streams ); + +/* fd_quic_stream_set_rx_max_stream_data sets the maximum amount of data that + can be sent by the peer on a stream. 
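+   For example (the variable `stream` and the 1232 byte figure are illustrative only, not a
+   recommendation), a receiver that expects at most one transaction-sized payload per stream
+   might call:
+
+     fd_quic_stream_set_rx_max_stream_data( stream, 1232UL );
+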
This update will propagate to the peer + via a MAX_STREAM_DATA frame. + + A violation of this flow control param will result in connection termination + with FLOW_CONTROL_ERROR, per RFC 9000. + + Note that updating this param will not affect the `max_data` param (above). + The effective limit will be the smaller of the two (see the stream loop in + `fd_quic.c`). Therefore, a user should consider both params when configuring + flow control. */ +FD_QUIC_API void +fd_quic_stream_set_rx_max_stream_data( fd_quic_stream_t * stream, ulong rx_max_stream_data ); FD_PROTOTYPES_END diff --git a/src/tango/quic/fd_quic_conn.c b/src/tango/quic/fd_quic_conn.c index fb9c4d2d33..bf00aaa666 100644 --- a/src/tango/quic/fd_quic_conn.c +++ b/src/tango/quic/fd_quic_conn.c @@ -2,6 +2,7 @@ #include "fd_quic_common.h" #include "../../util/fd_util.h" #include "fd_quic_pkt_meta.h" +#include "fd_quic_private.h" /* define a map for stream_id -> stream* */ #define MAP_NAME fd_quic_stream_map @@ -17,7 +18,6 @@ struct fd_quic_conn_layout { ulong stream_cnt; ulong stream_ptr_off; ulong stream_footprint; - ulong stream_off; int stream_map_lg; ulong stream_map_off; ulong pkt_meta_off; @@ -70,14 +70,6 @@ fd_quic_conn_footprint_ext( fd_quic_limits_t const * limits, layout->stream_ptr_off = off; off += stream_cnt * sizeof(void *); - /* allocate space for stream instances */ - ulong stream_footprint = fd_quic_stream_footprint( tx_buf_sz ); - layout->stream_footprint = stream_footprint; - - off = fd_ulong_align_up( off, fd_quic_stream_align() ); - layout->stream_off = off; - off += stream_cnt*stream_footprint; - /* allocate space for stream hash map */ ulong lg = 0; while( lg < 40 && (1ul<tot_num_streams = layout.stream_cnt; conn->state = FD_QUIC_CONN_STATE_INVALID; - /* Initialize stream pointers */ - - conn->streams = (fd_quic_stream_t **)( (ulong)mem + layout.stream_ptr_off ); - /* Initialize streams */ FD_QUIC_STREAM_LIST_SENTINEL( conn->send_streams ); - FD_QUIC_STREAM_LIST_SENTINEL( conn->unused_streams ); - - fd_quic_stream_t * unused_streams = conn->unused_streams; - - ulong stream_laddr = (ulong)mem + layout.stream_off; - for( ulong j=0; j < layout.stream_cnt; j++ ) { - fd_quic_stream_t * stream = fd_quic_stream_new( - (void *)stream_laddr, conn, limits->tx_buf_sz ); - if( FD_UNLIKELY( !stream ) ) return NULL; - - conn->streams[j] = stream; - - /* insert into unused list */ - FD_QUIC_STREAM_LIST_INSERT_BEFORE( unused_streams, stream ); - - stream_laddr += layout.stream_footprint; - } /* Initialize stream hash map */ ulong stream_map_laddr = (ulong)mem + layout.stream_map_off; - FD_TEST( stream_laddr <= stream_map_laddr ); conn->stream_map = fd_quic_stream_map_join( fd_quic_stream_map_new( (void *)stream_map_laddr, layout.stream_map_lg ) ); if( FD_UNLIKELY( !conn->stream_map ) ) return NULL; @@ -225,3 +195,77 @@ void * fd_quic_conn_get_context( fd_quic_conn_t * conn ) { return conn->context; } + + +/* set the max concurrent streams value for the specified type + This is used to flow control the peer. 
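+
+   For example (the stream count is illustrative only), a server-side QoS policy might grant a
+   newly accepted connection extra unidirectional streams from its conn_new callback:
+
+     fd_quic_conn_set_max_stream( conn, FD_QUIC_TYPE_UNIDIR, 128UL );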
+ + type is one of: + FD_QUIC_TYPE_UNIDIR + FD_QUIC_TYPE_BIDIR */ +FD_QUIC_API void +fd_quic_conn_set_max_stream( fd_quic_conn_t * conn, int dirtype, ulong stream_cnt ) { + if( FD_UNLIKELY( dirtype != FD_QUIC_TYPE_UNIDIR + && dirtype != FD_QUIC_TYPE_BIDIR ) ) { + FD_LOG_ERR(( "fd_quic_conn_set_max_stream called with invalid type" )); + return; + } + + fd_quic_t * quic = conn->quic; + fd_quic_state_t * state = fd_quic_get_state( quic ); + + /* TODO align usage of "type" and "dirtype" + perhaps: + dir - direction: bidir or unidir + role - client or server + type - dir | role */ + uint server = (uint)conn->server; + uint type = server + ( (uint)dirtype << 1u ); + + /* store the desired value */ + conn->tgt_max_streams[type] = stream_cnt; + + /* if we're decreasing the max streams value, we simply set the target + to lower the value, we have to wait until streams are freed + if we're increasing, we try to allocate more streams from the pool + to satisfy the request */ + if( stream_cnt > conn->alloc_streams[type] ) { + + /* load the currently allocated streams */ + ulong alloc_streams = conn->alloc_streams[type]; + ulong tgt_max_streams = conn->tgt_max_streams[type]; + + fd_quic_stream_t * unused_streams = conn->unused_streams[type]; + + /* allocate streams from the pool to the connection */ + for( ulong j = alloc_streams; j < tgt_max_streams; ++j ) { + fd_quic_stream_t * stream = fd_quic_stream_pool_alloc( state->stream_pool ); + + /* best effort */ + if( FD_UNLIKELY( !stream ) ) break; + + /* insert into unused list */ + FD_QUIC_STREAM_LIST_INSERT_BEFORE( unused_streams, stream ); + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_UNUSED; + + /* adjust alloc_streams to match */ + alloc_streams++; + } + + /* store alloc_streams */ + conn->alloc_streams[type] = alloc_streams; + } +} + + +/* get the current value for the concurrent streams for the specified type + + type is one of: + FD_QUIC_TYPE_UNIDIR + FD_QUIC_TYPE_BIDIR */ +FD_QUIC_API ulong +fd_quic_conn_get_max_streams( fd_quic_conn_t * conn, int dirtype ) { + uint server = (uint)conn->server; + uint type = server + ( (uint)dirtype << 1u ); + return conn->tgt_max_streams[type]; +} diff --git a/src/tango/quic/fd_quic_conn.h b/src/tango/quic/fd_quic_conn.h index 84a5328ddd..27cec5e591 100644 --- a/src/tango/quic/fd_quic_conn.h +++ b/src/tango/quic/fd_quic_conn.h @@ -7,6 +7,8 @@ #include "crypto/fd_quic_crypto_suites.h" #include "templ/fd_quic_transport_params.h" #include "fd_quic_pkt_meta.h" +#include "fd_quic_stream_pool.h" +#include "../../ballet/txn/fd_txn.h" #define FD_QUIC_CONN_STATE_INVALID 0 /* dead object / freed */ #define FD_QUIC_CONN_STATE_HANDSHAKE 1 /* currently doing handshaking with peer */ @@ -137,10 +139,15 @@ struct fd_quic_conn { uint key_phase_upd; /* set to 1 if we're undertaking a key update */ ulong tot_num_streams; - fd_quic_stream_t ** streams; /* array of stream pointers */ - fd_quic_stream_t send_streams[1]; /* sentinel of list of streams needing action */ - fd_quic_stream_t unused_streams[1]; /* sentinel of list of unused streams */ - fd_quic_stream_map_t * stream_map; /* map stream_id -> stream */ + fd_quic_stream_t send_streams[1]; /* sentinel of list of streams needing action */ + fd_quic_stream_t unused_streams[4][1]; /* sentinel of list of unused streams */ + fd_quic_stream_t used_streams[1]; /* sentinel of list of used streams */ + /* invariant: an allocated stream must be in exactly one of the following lists: + send_streams + unused_streams + used_streams */ + fd_quic_stream_map_t * stream_map; /* map 
stream_id -> stream */ + ulong tgt_max_streams[4]; /* target value for max_streams by type */ /* packet number info each encryption level maps to a packet number space @@ -195,9 +202,8 @@ struct fd_quic_conn { crypto streams - one for each enc_level acks sent */ - ulong next_stream_id[4]; /* next stream id by type - see rfc9000 2.1 */ + ulong next_stream_id[4]; /* next unused stream id by type - see rfc9000 2.1 */ - uint max_concur_streams; /* configured max concurrent streams by connection and type */ ulong max_streams[4]; /* maximum stream id by type */ /* rfc9000: 19.11 Note that these frames (and the corresponding transport parameters) @@ -210,8 +216,11 @@ struct fd_quic_conn { 0x02 Client-Initiated, Unidirectional 0x03 Server-Initiated, Unidirectional */ + /* TODO remove this if unused: */ ulong num_streams[4]; /* current number of streams of each type */ + ulong alloc_streams[4]; /* current number of streams allocated by type */ + /* TODO find better name than pool */ fd_quic_pkt_meta_pool_t pkt_meta_pool; ulong num_pkt_meta; @@ -280,6 +289,17 @@ struct fd_quic_conn { fd_quic_conn_t * next; ulong token_len; uchar token[FD_QUIC_TOKEN_SZ_MAX]; + + /* stream pool */ + /* when do we assign streams to connections? + can't increase max_stream_data until we have streams + Easiest to put under user control + + Maybe: + Assign N at initial connection creation + Add more via api call + */ + fd_quic_stream_pool_t * stream_pool; }; FD_PROTOTYPES_BEGIN @@ -320,6 +340,26 @@ fd_quic_handshake_complete( fd_quic_conn_t * conn ) { return conn->handshake_complete; } + +/* set the max concurrent streams value for the specified type + This is used to flow control the peer. + + type is one of: + FD_QUIC_CONN_MAX_STREAM_TYPE_UNIDIR + FD_QUIC_CONN_MAX_STREAM_TYPE_BIDIR */ +FD_QUIC_API void +fd_quic_conn_set_max_stream( fd_quic_conn_t * conn, int type, ulong stream_cnt ); + + +/* get the current value for the concurrent streams for the specified type + + type is one of: + FD_QUIC_CONN_MAX_STREAM_TYPE_UNIDIR + FD_QUIC_CONN_MAX_STREAM_TYPE_BIDIR */ +FD_QUIC_API ulong +fd_quic_conn_get_max_streams( fd_quic_conn_t * conn, int type ); + + //static inline void //fd_quic_conn_set_next( fd_quic_conn_t * conn, // fd_quic_conn_t * next ) { diff --git a/src/tango/quic/fd_quic_enum.h b/src/tango/quic/fd_quic_enum.h new file mode 100644 index 0000000000..f62cc3285c --- /dev/null +++ b/src/tango/quic/fd_quic_enum.h @@ -0,0 +1,83 @@ +#ifndef HEADER_fd_src_tango_quic_fd_quic_enum_h +#define HEADER_fd_src_tango_quic_fd_quic_enum_h + + +/* FD_QUIC_STREAM_TYPE_* indicate stream type (two least significant + bits of a stream ID) */ +#define FD_QUIC_STREAM_TYPE_BIDI_CLIENT 0 +#define FD_QUIC_STREAM_TYPE_BIDI_SERVER 1 +#define FD_QUIC_STREAM_TYPE_UNI_CLIENT 2 +#define FD_QUIC_STREAM_TYPE_UNI_SERVER 3 + +/* FD_QUIC_{SUCCESS,FAILED} are used for error return codes. */ +#define FD_QUIC_SUCCESS (0) +#define FD_QUIC_FAILED (1) + +/* FD_QUIC_TYPE_{UNI,BI}DIR indicate stream type. */ +#define FD_QUIC_TYPE_BIDIR (0) +#define FD_QUIC_TYPE_UNIDIR (1) + +/* FD_QUIC_ALIGN specifies the alignment needed for an fd_quic_t. + This is provided to facilitate compile-time QUIC declarations. + Also see fd_quic_align() */ +#define FD_QUIC_ALIGN (4096UL) /* 4KiB */ + +/* FD_QUIC_MTU is the assumed network link MTU in bytes, including L2 + and L3 headers. */ +#define FD_QUIC_MTU (1500) + +/* FD_QUIC_INITIAL_PAYLOAD_SZ_MIN is the min byte size of the UDP payload + of Initial-type packets. 
Mandated for both clients and servers as a + form of MTU discovery and to mitigate amplification attacks. See + RFC 9000 Section 14.1: + https://datatracker.ietf.org/doc/html/rfc9000#name-initial-datagram-size */ +#define FD_QUIC_INITIAL_PAYLOAD_SZ_MIN (1200) +#define FD_QUIC_INITIAL_PAYLOAD_SZ_MAX (FD_QUIC_INITIAL_PAYLOAD_SZ_MIN) + +/* Tokens (both RETRY and NEW_TOKEN) are specified by varints. We bound it to + 77 bytes. Both our and quinn's RETRY tokens are 77 bytes, but our client + needs to be able to handle other server impl's of RETRY too. + + FIXME change this bound (requires variable-length encoding). */ +#define FD_QUIC_TOKEN_SZ_MAX (77) +/* Retry packets don't carry a token length field, so we infer it from the + footprint of a packet with a zero-length token and zero-length conn ids. */ +#define FD_QUIC_EMPTY_RETRY_PKT_SZ (23) + +/* FD_QUIC_MAX_PAYLOAD_SZ is the max byte size of the UDP payload of any + QUIC packets. Derived from FD_QUIC_MTU by subtracting the typical + IPv4 header (no options) and UDP header sizes. */ +#define FD_QUIC_MAX_PAYLOAD_SZ (FD_QUIC_MTU - 20 - 8) + +/* FD_QUIC_ROLE_{CLIENT,SERVER} identify the fd_quic_t's role as a + client or server. */ +#define FD_QUIC_ROLE_CLIENT 1 +#define FD_QUIC_ROLE_SERVER 2 + +/* FD_QUIC_SEND_ERR_* are negative int error codes indicating a stream + send failure. + ...INVAL_STREAM: Not allowed to send for stream ID (e.g. not open) + ...INVAL_CONN: Connection not in valid state for sending + ...FIN: Not allowed to send, stream finished */ +#define FD_QUIC_SEND_ERR_INVAL_STREAM (-1) +#define FD_QUIC_SEND_ERR_INVAL_CONN (-2) +#define FD_QUIC_SEND_ERR_STREAM_FIN (-3) + +/* FD_QUIC_MIN_CONN_ID_CNT: min permitted conn ID count per conn */ +#define FD_QUIC_MIN_CONN_ID_CNT (4UL) + +/* FD_QUIC_DEFAULT_SPARSITY: default fd_quic_limits_t->conn_id_sparsity */ +#define FD_QUIC_DEFAULT_SPARSITY (2.5) + +/* FD_QUIC_NOTIFY_* indicate stream notification types. + ...END: Stream lifetime has ended, no more callbacks will be + generated for it. Stream will be freed after event + delivery. 
+   ...RESET: Peer has reset the stream (will not send)
+   ...ABORT: Peer has aborted the stream (will not receive) */
+#define FD_QUIC_NOTIFY_END   (100)
+#define FD_QUIC_NOTIFY_RESET (101)
+#define FD_QUIC_NOTIFY_ABORT (102)
+
+
+#endif
diff --git a/src/tango/quic/fd_quic_private.h b/src/tango/quic/fd_quic_private.h
index 36fc34b9d1..e66a662d0e 100644
--- a/src/tango/quic/fd_quic_private.h
+++ b/src/tango/quic/fd_quic_private.h
@@ -64,15 +64,16 @@ struct __attribute__((aligned(16UL))) fd_quic_state_private {
 
   /* Various internal state */
 
-  fd_quic_conn_t *     conns;          /* free list of unused connections */
-  fd_quic_conn_map_t * conn_map;       /* map connection ids -> connection */
-  fd_quic_event_t *    service_queue;  /* priority queue of connections by service time */
+  fd_quic_conn_t *        conns;          /* free list of unused connections */
+  fd_quic_conn_map_t *    conn_map;       /* map connection ids -> connection */
+  fd_quic_event_t *       service_queue;  /* priority queue of connections by service time */
+  fd_quic_stream_pool_t * stream_pool;    /* stream pool */
 
   /* crypto members */
-  fd_quic_crypto_ctx_t crypto_ctx[1];  /* crypto context */
+  fd_quic_crypto_ctx_t    crypto_ctx[1];  /* crypto context */
 
-  fd_quic_pkt_meta_t * pkt_meta;       /* records the metadata for the contents
-                                          of each sent packet */
+  fd_quic_pkt_meta_t *    pkt_meta;       /* records the metadata for the contents
+                                             of each sent packet */
 
   /* flow control - configured initial limits */
   ulong initial_max_data;           /* directly from transport params */
diff --git a/src/tango/quic/fd_quic_qos.c b/src/tango/quic/fd_quic_qos.c
new file mode 100644
index 0000000000..31ca9fd3f5
--- /dev/null
+++ b/src/tango/quic/fd_quic_qos.c
@@ -0,0 +1,221 @@
+#include "fd_quic_qos.h"
+#include "../../util/rng/fd_rng.h"
+#include "../stake/fd_stake.h"
+#include "fd_quic.h"
+#include "fd_quic_conn.h"
+#include "fd_quic_enum.h"
+#include "fd_quic_private.h"
+#include "tls/fd_quic_tls.h"
+
+ulong
+fd_quic_qos_align( void ) {
+  return FD_QUIC_QOS_ALIGN;
+}
+
+ulong
+fd_quic_qos_footprint( fd_quic_qos_limits_t * limits ) {
+  ulong l;
+  l = FD_LAYOUT_INIT;
+  l = FD_LAYOUT_APPEND( l, fd_quic_qos_align(), sizeof( fd_quic_qos_t ) );
+  l = FD_LAYOUT_APPEND(
+      l, fd_quic_qos_pq_align(), fd_quic_qos_pq_footprint( limits->pq_lg_slot_cnt ) );
+  l = FD_LAYOUT_APPEND( l, fd_lru_align(), fd_lru_footprint( limits->lru_depth, 0UL ) );
+  /* account for the cnt map that fd_quic_qos_new carves out of the same region */
+  l = FD_LAYOUT_APPEND(
+      l, fd_quic_qos_cnt_align(), fd_quic_qos_cnt_footprint( limits->cnt_lg_slot_cnt ) );
+  return FD_LAYOUT_FINI( l, fd_quic_qos_align() );
+}
+
+/* fd_quic_qos_new formats an unused memory region for use as a QoS (Quality of Service) component.
+   Not designed to be shared across multiple joins (pointer addresses are local to the joined
+   process).
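+
+   A minimal setup sketch (the workspace `wksp` and its allocation tag are assumptions made for
+   illustration; `limits` is an fd_quic_qos_limits_t populated by the caller, e.g. as sketched in
+   fd_quic_qos.h):
+
+     void * mem = fd_wksp_alloc_laddr( wksp, fd_quic_qos_align(),
+                                       fd_quic_qos_footprint( &limits ), 1UL );
+     fd_quic_qos_t * qos = fd_quic_qos_join( fd_quic_qos_new( mem, &limits ) );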
+ */
+void *
+fd_quic_qos_new( void * mem, fd_quic_qos_limits_t * limits ) {
+  FD_SCRATCH_ALLOC_INIT( l, mem );
+  fd_quic_qos_t * qos = FD_SCRATCH_ALLOC_APPEND( l, fd_quic_qos_align(), sizeof( fd_quic_qos_t ) );
+  qos->limits = *limits;
+  void * pq = FD_SCRATCH_ALLOC_APPEND(
+      l, fd_quic_qos_pq_align(), fd_quic_qos_pq_footprint( limits->pq_lg_slot_cnt ) );
+  fd_quic_qos_pq_new( pq, limits->pq_lg_slot_cnt );
+  void * lru =
+      FD_SCRATCH_ALLOC_APPEND( l, fd_lru_align(), fd_lru_footprint( limits->lru_depth, 0UL ) );
+  fd_lru_new( lru, limits->lru_depth, 0UL );
+  void * cnt = FD_SCRATCH_ALLOC_APPEND(
+      l, fd_quic_qos_cnt_align(), fd_quic_qos_cnt_footprint( limits->cnt_lg_slot_cnt ) );
+  fd_quic_qos_cnt_new( cnt, limits->cnt_lg_slot_cnt );
+  return mem;
+}
+
+fd_quic_qos_t *
+fd_quic_qos_join( void * mem ) {
+  FD_SCRATCH_ALLOC_INIT( l, mem );
+  fd_quic_qos_t * qos = FD_SCRATCH_ALLOC_APPEND( l, fd_quic_qos_align(), sizeof( fd_quic_qos_t ) );
+  qos->pq  = fd_quic_qos_pq_join( FD_SCRATCH_ALLOC_APPEND(
+      l, fd_quic_qos_pq_align(), fd_quic_qos_pq_footprint( qos->limits.pq_lg_slot_cnt ) ) );
+  qos->lru = fd_lru_join( FD_SCRATCH_ALLOC_APPEND(
+      l, fd_lru_align(), fd_lru_footprint( qos->limits.lru_depth, 0UL ) ) );
+  qos->cnt = fd_quic_qos_cnt_join( FD_SCRATCH_ALLOC_APPEND(
+      l, fd_quic_qos_cnt_align(), fd_quic_qos_cnt_footprint( qos->limits.cnt_lg_slot_cnt ) ) );
+  return qos;
+}
+
+void
+fd_quic_qos_conn_new( fd_quic_qos_t *     qos,
+                      fd_stake_t *        stake,
+                      fd_rng_t *          rng,
+                      fd_quic_conn_t *    conn,
+                      fd_stake_pubkey_t * pubkey ) {
+  /* check the incoming connection's origin key (pubkey / ipv4) is not exceeding max */
+  fd_quic_qos_cnt_key_t check_cnt_keys[2] = { 0 };
+  if ( FD_UNLIKELY( pubkey ) ) check_cnt_keys[0].pubkey = *pubkey;
+  check_cnt_keys[1].ip4_addr = conn->peer[conn->cur_peer_idx].net.ip_addr;
+  /* debug-only dump of the current per-origin conn counts */
+  FD_DEBUG( for ( ulong i = 0; i < fd_quic_qos_cnt_slot_cnt( qos->cnt ); i++ ) {
+    if ( !fd_quic_qos_cnt_key_inval( qos->cnt[i].key ) ) {
+      FD_LOG_DEBUG( ( "%u: %lu", qos->cnt[i].key.ip4_addr, qos->cnt[i].count ) );
+    }
+  } );
+  for ( ulong i = 0; i < 2; i++ ) {
+    fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, check_cnt_keys[i], NULL );
+    if ( FD_UNLIKELY( query && query->count >= qos->limits.cnt_max_conns ) ) {
+      fd_quic_conn_close( conn, FD_QUIC_CONN_REASON_CONNECTION_REFUSED );
+      return;
+    }
+  }
+
+  ulong            begin       = ULONG_MAX;
+  ulong            end         = 0;
+  fd_quic_conn_t * evict       = NULL;
+  ulong            max_streams = 0;
+
+  do {
+    begin = fd_mvcc_begin_read( &stake->mvcc );
+
+    /* get connection stake (pubkey is stored in a conn) */
+    ulong conn_stake = 0;
+    if ( FD_LIKELY( pubkey ) ) { /* optimize for authenticated conns */
+      fd_stake_node_t * node = fd_stake_node_query( fd_stake_nodes_laddr( stake ), *pubkey, NULL );
+      if ( FD_LIKELY( node ) ) { /* optimize for staked traffic */
+        conn_stake = node->stake;
+      }
+    }
+
+    /* determine conn eviction and update conn counts for a given origin key (pubkey / ipv4 addr).
+       evict is the conn to close, or NULL if nothing needs evicting. */
+    evict = NULL;
+    fd_quic_conn_t * pq_evict = conn; /* == conn means the conn was not placed in the pq */
+    if ( conn_stake > 0 ) pq_evict = fd_quic_qos_pq_conn_upsert( qos, stake, rng, conn, pubkey );
+    if ( FD_LIKELY( pq_evict == conn ) ) { /* unlikely to evict from pq */
+      fd_list_t * lru_evict = fd_lru_upsert( qos->lru, (ulong)conn ); /* NULL if no evict */
+
+      if ( FD_LIKELY( lru_evict ) ) {
+        /* save the evicted conn to return */
+        evict = (fd_quic_conn_t *)lru_evict->tag;
+
+        /* decrement the evicted conn's key (not the incoming conn's) */
+        fd_quic_qos_cnt_key_t cnt_key = { 0 };
+        cnt_key.ip4_addr = evict->peer[evict->cur_peer_idx].net.ip_addr;
+        fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, cnt_key, NULL );
+        if ( FD_UNLIKELY( !query ) ) FD_LOG_ERR( ( "fd_quic_qos: key in lru missing from cnt!" ) );
+        if ( FD_UNLIKELY( !--query->count ) ) fd_quic_qos_cnt_remove( qos->cnt, query );
+      }
+
+      /* increment the inserted key */
+      fd_quic_qos_cnt_key_t cnt_key = { 0 };
+      cnt_key.ip4_addr = conn->peer[conn->cur_peer_idx].net.ip_addr;
+      fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, cnt_key, NULL );
+      if ( FD_LIKELY( !query ) ) fd_quic_qos_cnt_insert( qos->cnt, cnt_key )->count = 1;
+      else query->count++;
+    } else {
+      /* pq_conn_upsert returned either NULL (inserted, nothing evicted) or the evicted conn */
+      evict = pq_evict;
+    }
+
+    /* determine flow control (max streams) from this conn's share of total stake
+       (integer percent, truncating division) */
+    ulong total_stake = fd_ulong_max( stake->total_stake, 1UL ); /* avoid division by zero */
+    ulong share       = ( conn_stake * 100UL ) / total_stake;
+    max_streams       = ( share * qos->limits.total_streams ) / 100UL;
+    max_streams       = fd_ulong_min( max_streams, qos->limits.max_streams );
+
+    end = fd_mvcc_end_read( &stake->mvcc );
+  } while ( end % 2 != 0 || end != begin );
+
+  if ( FD_LIKELY( evict ) ) {
+    /* The logic here is that we will first gracefully close a connection.
+       fd_quic_service will later attempt to service the pending close.
+
+       If there are still no connections available when a new connection arrives,
+       it will attempt to service any pending closes again. If there are still
+       no available connections, it will refuse the connection.
+
+       See also "Early check" in the initial pkt handler. */
+    fd_quic_conn_close( evict, FD_QUIC_CONN_REASON_INTERNAL_ERROR );
+    conn->quic->metrics.conn_aborted_cnt++;
+  }
+
+  max_streams = fd_ulong_max( max_streams, qos->limits.min_streams );
+  FD_DEBUG( FD_LOG_DEBUG( ( "server: new connection with alloted max streams %lu", max_streams ) ) );
+  fd_quic_conn_set_max_streams( conn, FD_QUIC_TYPE_UNIDIR, max_streams );
+}
+
+fd_quic_conn_t *
+fd_quic_qos_pq_conn_upsert( fd_quic_qos_t *     qos,
+                            fd_stake_t *        stake,
+                            fd_rng_t *          rng,
+                            fd_quic_conn_t *    conn,
+                            fd_stake_pubkey_t * pubkey ) {
+  ulong                 key_cnt = fd_quic_qos_pq_key_cnt( qos->pq );
+  ulong                 key_max = fd_quic_qos_pq_key_max( qos->pq );
+  fd_quic_conn_t *      evict   = NULL;
+  fd_quic_qos_cnt_key_t cnt_key = { 0 };
+
+  /* only evict if the pq map is >= half full */
+  if ( FD_LIKELY( key_cnt >= key_max / 2 ) ) {
+    /* randomly sample lg(n) entries in the staked map and evict the conn with the smallest stake */
+    fd_stake_node_t * node       = fd_stake_node_query( fd_stake_nodes_laddr( stake ), *pubkey, NULL );
+    ulong             conn_stake = 0UL;
+    if ( node ) conn_stake = node->stake;
+    fd_quic_qos_pq_t * arg_min = NULL;
+    ulong              min     = conn_stake;
+    int                lg_n    = fd_quic_qos_pq_lg_slot_cnt( qos->pq );
+    ulong              n       = fd_quic_qos_pq_slot_cnt( qos->pq );
+    for ( int i = 0; i < lg_n; i++ ) {
+      ulong              slot_idx    = fd_rng_ulong( rng ) % n;
+      fd_quic_qos_pq_t * random_slot = &qos->pq[slot_idx];
+      /* optimize for key exists when random sampling a key to evict */
+      if ( FD_LIKELY( !fd_quic_qos_pq_key_inval( random_slot->key ) ) ) {
+        fd_stake_node_t * random_node =
+            fd_stake_node_query( fd_stake_nodes_laddr( stake ), random_slot->pubkey, NULL );
+        if ( random_node && random_node->stake < min ) {
+          arg_min = random_slot;
+          min     = random_node->stake; /* track the smallest stake sampled so far */
+        }
+      }
+    }
+    if ( FD_UNLIKELY( arg_min ) ) { /* unlikely to meet stake threshold to evict */
+      /* save the evicted conn to return */
+      evict = arg_min->conn;
+
+      /* remove the evicted key and take over its slot */
+      fd_quic_qos_pq_remove( qos->pq, arg_min );
+      fd_quic_qos_pq_t * insert = fd_quic_qos_pq_insert( qos->pq, conn->local_conn_id );
+      if ( FD_LIKELY( insert ) ) {
+        insert->conn   = conn;
+        insert->pubkey = *pubkey;
+      }
+
+      /* decrement the evicted key */
+      memset( &cnt_key, 0, sizeof( fd_quic_qos_cnt_key_t ) );
+      
cnt_key.pubkey = arg_min->pubkey; + fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, cnt_key, NULL ); + if ( FD_UNLIKELY( !query ) ) FD_LOG_ERR( ( "fd_quic_qos: key in pq missing from cnt!" ) ); + if ( FD_UNLIKELY( !--query->count ) ) fd_quic_qos_cnt_remove( qos->cnt, query ); + } + } + + /* if there is space (regardless of whether we evicted), insert the conn */ + if ( FD_LIKELY( key_cnt < key_max / 2 ) ) { + fd_quic_qos_pq_t * insert = fd_quic_qos_pq_insert( qos->pq, conn->local_conn_id ); + /* if insert is NULL (key already in map), this indicates a programming error. even though + connections are pooled, the previous usage of the connection should have been freed (and + removed from the map) already. */ + + if ( FD_UNLIKELY( insert == NULL ) ) + FD_LOG_ERR( ( "fd_quic_qos: detected reuse of conn without free!" ) ); + insert->conn = conn; + insert->pubkey = *pubkey; + + /* increment the inserted key */ + memset( &cnt_key, 0, sizeof( fd_quic_qos_cnt_key_t ) ); + cnt_key.pubkey = *pubkey; + fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, cnt_key, NULL ); + if ( FD_LIKELY( !query ) ) fd_quic_qos_cnt_insert( qos->cnt, cnt_key )->count = 1; + else query->count++; + } + return evict; +} diff --git a/src/tango/quic/fd_quic_qos.h b/src/tango/quic/fd_quic_qos.h new file mode 100644 index 0000000000..5c55ea8a19 --- /dev/null +++ b/src/tango/quic/fd_quic_qos.h @@ -0,0 +1,126 @@ +#ifndef HEADER_fd_src_tango_quic_fd_quic_qos_h +#define HEADER_fd_src_tango_quic_fd_quic_qos_h + +#include "../../ballet/txn/fd_txn.h" +#include "../../util/fd_util.h" +#include "../../util/fd_util_base.h" +#include "../lru/fd_lru.h" +#include "../stake/fd_stake.h" +#include "fd_quic_common.h" +#include "fd_quic_conn.h" +#include + +#define FD_QUIC_QOS_LRU_ALIGN ( 128UL ) +#define FD_QUIC_QOS_ALIGN ( 128UL ) + +/* Default limits */ +#define FD_QUIC_QOS_DEFAULT_MIN_STREAMS ( 1UL << 7 ) +#define FD_QUIC_QOS_DEFAULT_MAX_STREAMS ( 1UL << 11 ) +#define FD_QUIC_QOS_DEFAULT_TOTAL_STREAMS ( 1UL << 16 ) +#define FD_QUIC_QOS_DEFAULT_PRIV_CONNS ( 1UL << 16 ) +#define FD_QUIC_QOS_DEFAULT_UNPRIV_CONNS ( 1UL << 16 ) + +/* Configurable limits */ +struct fd_quic_qos_limits { + ulong min_streams; /* the min # of concurrent streams that can be alloted to a single conn */ + ulong max_streams; /* the max # of concurrent streams that can be alloted to a single conn */ + ulong total_streams; /* the total # of streams that can be alloted across all conns */ + int pq_lg_slot_cnt; /* the lg max # of "prioritized" conns. stake-based (priority) eviction. */ + ulong lru_depth; /* the lg max # of "unprioritzed" conns. LRU eviction. */ + int cnt_lg_slot_cnt; /* the lg max # of origins (pubkey or IpV4) we track conn cnts. */ + ulong cnt_max_conns; /* the max # of conns allowed per conn origin key. 
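fd_quic_qos_pq_conn_upsert above avoids a full scan of the pq map by sampling lg(n) random slots and evicting the lowest-staked sample, and only when that sample is below the incoming connection's stake. A standalone sketch of that sampling loop (plain C; the stakes[] array, rand(), and the zero-stake-means-empty convention are simplifications standing in for the pq map, fd_rng, and the key-invalid check):

#include <stdio.h>
#include <stdlib.h>

/* Sketch: pick an eviction candidate by sampling lg_n random slots and
   keeping the minimum-stake sample that is below the incoming stake.
   Returns the slot index, or -1 if nothing sampled beats the incoming conn. */
static long
sample_eviction_candidate( unsigned long const * stakes, unsigned long n, int lg_n,
                           unsigned long incoming_stake ) {
  long          arg_min = -1L;
  unsigned long min     = incoming_stake;
  for( int i = 0; i < lg_n; i++ ) {
    unsigned long idx = (unsigned long)rand() % n;
    if( !stakes[idx] ) continue;                                    /* empty slot */
    if( stakes[idx] < min ) { min = stakes[idx]; arg_min = (long)idx; }
  }
  return arg_min;
}

int
main( void ) {
  unsigned long stakes[8] = { 0UL, 7UL, 42UL, 0UL, 3UL, 99UL, 0UL, 11UL };
  long victim = sample_eviction_candidate( stakes, 8UL, 3, 10UL );
  if( victim >= 0L ) printf( "evict slot %ld (stake %lu)\n", victim, stakes[victim] );
  else               printf( "no eviction: incoming conn does not beat the sample\n" );
  return 0;
}

Sampling only lg(n) slots keeps the handler cheap per new connection at the cost of occasionally missing the global minimum, which is acceptable since the goal is only to bias the pq toward higher-staked peers.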
*/ +}; +typedef struct fd_quic_qos_limits fd_quic_qos_limits_t; + +struct fd_quic_qos_pq { + ulong key; /* conn->local_conn_id */ + uint hash; + fd_quic_conn_t * conn; + fd_stake_pubkey_t pubkey; +}; +typedef struct fd_quic_qos_pq fd_quic_qos_pq_t; +#define MAP_NAME fd_quic_qos_pq +#define MAP_T fd_quic_qos_pq_t +#include "../../util/tmpl/fd_map_dynamic.c" + +union fd_quic_qos_cnt_key { + fd_stake_pubkey_t pubkey; + uint ip4_addr; +}; +typedef union fd_quic_qos_cnt_key fd_quic_qos_cnt_key_t; +static fd_quic_qos_cnt_key_t cnt_key_null = { 0 }; + +struct fd_quic_qos_cnt { + fd_quic_qos_cnt_key_t key; + uint hash; + ulong count; +}; +typedef struct fd_quic_qos_cnt fd_quic_qos_cnt_t; +#define MAP_NAME fd_quic_qos_cnt +#define MAP_T fd_quic_qos_cnt_t +#define MAP_KEY_T fd_quic_qos_cnt_key_t +#define MAP_KEY_NULL cnt_key_null +#define MAP_KEY_INVAL( k ) !( memcmp( &k, &cnt_key_null, sizeof( fd_quic_qos_cnt_key_t ) ) ) +#define MAP_KEY_EQUAL( k0, k1 ) !( memcmp( ( &k0 ), ( &k1 ), sizeof( fd_quic_qos_cnt_key_t ) ) ) +#define MAP_KEY_EQUAL_IS_SLOW 1 +#define MAP_KEY_HASH( key ) ( (uint)( fd_hash( 0UL, &key, sizeof( fd_quic_qos_cnt_key_t ) ) ) ) +#include "../../util/tmpl/fd_map_dynamic.c" + +struct fd_quic_qos { + fd_quic_qos_limits_t limits; + fd_stake_t * stake; + fd_rng_t * rng; + /* priority queue for "prioritized traffic". eviction is done by removing the minimum element of a + * random lg(n) sample (vs. global minimum). connections in the pq will _probably_ have stake, but + * it is not a strict requirement. */ + fd_quic_qos_pq_t * pq; + /* LRU cache for "unprioritized traffic". connections in the lru will probably not have stake, but + * it is not strictly the case: a staked connection will end up in the LRU if it doesn't meet the + * threshold to evict from the pq. */ + fd_lru_t * lru; + /* counter of connections for a given pubkey / IPv4 address */ + fd_quic_qos_cnt_t * cnt; +}; +typedef struct fd_quic_qos fd_quic_qos_t; + +FD_PROTOTYPES_BEGIN + +ulong +fd_quic_qos_align( void ); + +ulong +fd_quic_qos_footprint( fd_quic_qos_limits_t * limits ); + +void * +fd_quic_qos_new( void * mem, fd_quic_qos_limits_t * limits ); + +fd_quic_qos_t * +fd_quic_qos_join( void * mem ); + +/* fd_quic_qos_conn_new attempts to place conn in the PQ or LRU, as well as how many lifetime QUIC + streams (client-initiated, unidirectional) to allocate to this conn. It is designed to work with + fd_quic's conn_new callback */ +void +fd_quic_qos_conn_new( fd_quic_qos_t * qos, + fd_stake_t * stake, + fd_rng_t * rng, + fd_quic_conn_t * conn, + fd_stake_pubkey_t * pubkey ); + +/* fd_quic_qos_pq_upsert upserts conn into the pq map. + + - If there is space in the map, it inserts conn and returns NULL. + - If there is no space in the map, it will look for an eviction candidate by randomly sampling + lg(n) conns, and evicting the lowest stake one that is also less than the incoming conn's stake. + - If it finds a candidate, it will evict and return candidate, and insert the incoming conn. + - Otherwise, it will return the incoming conn itself. 
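The cnt map above keys on a union so a staked origin is counted by pubkey while an unstaked origin is counted by source IPv4 address, and the MAP_KEY_* macros compare the whole union with memcmp. A standalone sketch of that key convention (plain C; the local type names are stand-ins, and zero-initializing the key before setting either member is what makes whole-union memcmp behave predictably):

#include <stdio.h>
#include <string.h>

/* Sketch of the cnt-map key: either a 32-byte pubkey or an IPv4 address,
   with "is null" and "is equal" defined as whole-union memcmps. */
typedef struct { unsigned char pubkey[32]; } pubkey_t;
typedef union  { pubkey_t pubkey; unsigned int ip4_addr; } cnt_key_t;

static cnt_key_t const cnt_key_null = { 0 };

static int key_inval( cnt_key_t const * k )                      { return !memcmp( k, &cnt_key_null, sizeof( cnt_key_t ) ); }
static int key_equal( cnt_key_t const * a, cnt_key_t const * b ) { return !memcmp( a, b, sizeof( cnt_key_t ) ); }

int
main( void ) {
  cnt_key_t a = { 0 }; a.ip4_addr = 0x0100007f;   /* 127.0.0.1, host byte order as in the unit test */
  cnt_key_t b = { 0 }; b.ip4_addr = 0x0100007f;
  printf( "inval(a)=%d equal(a,b)=%d\n", key_inval( &a ), key_equal( &a, &b ) );
  return 0;
}

This is also why both call sites earlier in the patch zero the key (via = { 0 } or memset) before filling in just one member.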
*/ +fd_quic_conn_t * +fd_quic_qos_pq_conn_upsert( fd_quic_qos_t * qos, + fd_stake_t * stake, + fd_rng_t * rng, + fd_quic_conn_t * conn, + fd_stake_pubkey_t * pubkey ); + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_tango_quic_fd_quic_qos_h */ diff --git a/src/tango/quic/fd_quic_stream.c b/src/tango/quic/fd_quic_stream.c index 64b0871023..93040cc1d9 100644 --- a/src/tango/quic/fd_quic_stream.c +++ b/src/tango/quic/fd_quic_stream.c @@ -148,6 +148,9 @@ fd_quic_stream_new( void * mem, fd_quic_conn_t * conn, ulong tx_buf_sz ) { stream->conn = conn; stream->stream_id = FD_QUIC_STREAM_ID_UNUSED; + /* stream pointing to itself is not a member of any list */ + stream->next = stream->prev = stream; + return stream; } @@ -157,8 +160,10 @@ fd_quic_stream_new( void * mem, fd_quic_conn_t * conn, ulong tx_buf_sz ) { stream the stream to free */ void fd_quic_stream_delete( fd_quic_stream_t * stream ) { - /* nothing to do */ - (void)stream; + /* stream pointing to itself is not a member of any list */ + stream->next = stream->prev = stream; + stream->list_memb = FD_QUIC_STREAM_LIST_MEMB_NONE; + stream->flags = 0u; } @@ -184,3 +189,14 @@ void * fd_quic_stream_get_context( fd_quic_stream_t * stream ) { return stream->context; } + + +/* set stream connection + + args + stream the stream to change + conn the connection to set on the stream or NULL to remove the connection */ +void +fd_quic_stream_set_conn( fd_quic_stream_t * stream, fd_quic_conn_t * conn ) { + stream->conn = conn; +} diff --git a/src/tango/quic/fd_quic_stream.h b/src/tango/quic/fd_quic_stream.h index fbbb668db4..24f0ef4a85 100644 --- a/src/tango/quic/fd_quic_stream.h +++ b/src/tango/quic/fd_quic_stream.h @@ -62,6 +62,12 @@ struct fd_quic_stream { # define FD_QUIC_DEFAULT_INITIAL_RX_MAX_STREAM_DATA 1280 // IPv6 minimum MTU + uint list_memb; /* list membership */ +# define FD_QUIC_STREAM_LIST_MEMB_NONE 0 +# define FD_QUIC_STREAM_LIST_MEMB_UNUSED 1 +# define FD_QUIC_STREAM_LIST_MEMB_USED 2 +# define FD_QUIC_STREAM_LIST_MEMB_SEND 3 + /* flow control */ ulong tx_max_stream_data; /* the limit on the number of bytes we are allowed to send to the peer on this stream @@ -118,13 +124,15 @@ struct fd_quic_stream { FD_QUIC_STREAM_LIST_LINK( stream_prev, new_stream ); \ } while(0) -/* remove stream from list */ +/* remove stream from list + + a stream pointing to itself is not in a list */ #define FD_QUIC_STREAM_LIST_REMOVE( stream ) \ do { \ fd_quic_stream_t * stream_prev = (stream)->prev; \ fd_quic_stream_t * stream_next = (stream)->next; \ FD_QUIC_STREAM_LIST_LINK( stream_prev, stream_next ); \ - (stream)->next = (stream)->prev = NULL; \ + (stream)->next = (stream)->prev = (stream); \ } while(0) @@ -205,9 +213,26 @@ fd_quic_stream_set_context( fd_quic_stream_t * stream, void * context ); void * fd_quic_stream_get_context( fd_quic_stream_t * stream ); +/* set rx max stream data + + This allows the peer to send more data on this stream + + args + stream the stream to change + rx_max_stream_data the new max_stream_data to set on the stream */ void fd_quic_stream_set_rx_max_stream_data( fd_quic_stream_t * stream, ulong rx_max_stream_data ); + +/* set stream connection + + args + stream the stream to change + conn the connection to set on the stream or NULL to remove the connection */ +void +fd_quic_stream_set_conn( fd_quic_stream_t * stream, fd_quic_conn_t * conn ); + + FD_PROTOTYPES_END #endif /* HEADER_fd_src_tango_quic_fd_quic_stream_h */ diff --git a/src/tango/quic/fd_quic_stream_pool.c b/src/tango/quic/fd_quic_stream_pool.c new file mode 100644 index 
0000000000..abe9c68a40 --- /dev/null +++ b/src/tango/quic/fd_quic_stream_pool.c @@ -0,0 +1,110 @@ +#include "fd_quic_stream_pool.h" + +#include "../../util/fd_util.h" + +/* returns the required footprint of fd_quic_stream_pool_t + + args + count the number of streams the pool will manage + tx_buf_sz the size of the tx buffer + should be 0 for RX only streams */ +FD_FN_CONST +ulong +fd_quic_stream_pool_footprint( ulong count, ulong tx_buf_sz ) { + ulong foot = fd_ulong_align_up( sizeof( fd_quic_stream_pool_t ), + FD_QUIC_STREAM_POOL_ALIGN ); + + ulong stream_foot = fd_quic_stream_footprint( tx_buf_sz ); + + return foot + stream_foot * count; +} + +/* returns a newly initialized stream pool + + args + mem the memory aligned to fd_quic_stream_pool_align, and at least fd_quic_stream_pool_footprint + bytes + count the number of streams the pool will manage + type the stream type used for the streams managed by this pool */ +fd_quic_stream_pool_t * +fd_quic_stream_pool_new( void * mem, ulong count, ulong tx_buf_sz ) { + ulong offs = 0; + ulong ul_mem = (ulong)mem; + + fd_quic_stream_pool_t * pool = (fd_quic_stream_pool_t*)ul_mem; + memset( pool, 0, sizeof( fd_quic_stream_pool_t ) ); + + pool->cap = count; + pool->cur_cnt = 0UL; + + offs += fd_ulong_align_up( sizeof( fd_quic_stream_pool_t ), FD_QUIC_STREAM_POOL_ALIGN ); + + ulong stream_foot = fd_quic_stream_footprint( tx_buf_sz ); + + FD_QUIC_STREAM_LIST_SENTINEL( pool->head ); + + /* allocate count streams */ + for( ulong j = 0; j < count; ++j ) { + fd_quic_stream_t * stream = fd_quic_stream_new( (void*)( ul_mem + offs ), NULL, tx_buf_sz ); + + FD_QUIC_STREAM_LIST_INSERT_BEFORE( pool->head, stream ); + pool->cur_cnt++; + + offs += stream_foot; + + } + + return pool; +} + + +/* delete a stream pool + + this will also delete all the associated streams + + All streams should be freed back to the pool before this function is called + + args + stream_pool the stream pool to free */ +void +fd_quic_stream_pool_delete( fd_quic_stream_pool_t * stream_pool ) { + (void)stream_pool; +} + + +/* allocates a stream from the pool + + args + stream_pool the pool from which to obtain the stream + + returns + the newly allocated stream, or NULL if no streams are available */ +fd_quic_stream_t * +fd_quic_stream_pool_alloc( fd_quic_stream_pool_t * pool ) { + fd_quic_stream_t * stream_sentinel = pool->head; + fd_quic_stream_t * stream = stream_sentinel->next; + + if( FD_UNLIKELY( stream == stream_sentinel ) ) { + /* no streams left in free list, return NULL */ + return NULL; + } + + /* remove from free list */ + FD_QUIC_STREAM_LIST_REMOVE( stream ); + pool->cur_cnt--; + + return stream; +} + +/* free a stream to the specified pool + + args + stream_pool the pool to return the stream to + stream the stream to return */ +void +fd_quic_stream_pool_free( fd_quic_stream_pool_t * pool, + fd_quic_stream_t * stream ) { + FD_QUIC_STREAM_LIST_INSERT_BEFORE( pool->head, stream ); + pool->cur_cnt++; +} + diff --git a/src/tango/quic/fd_quic_stream_pool.h b/src/tango/quic/fd_quic_stream_pool.h new file mode 100644 index 0000000000..ad9c81eff2 --- /dev/null +++ b/src/tango/quic/fd_quic_stream_pool.h @@ -0,0 +1,75 @@ +#ifndef HEADER_fd_src_tango_quic_fd_quic_stream_pool_h +#define HEADER_fd_src_tango_quic_fd_quic_stream_pool_h + +#include "fd_quic_stream.h" + +/* stream pool alignment */ +#define FD_QUIC_STREAM_POOL_ALIGN 128ul + +struct fd_quic_stream_pool { + ulong cap; /* the capacity of the pool */ + ulong cur_cnt; /* the current number of streams in the pool */ + 
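The stream pool above hands streams out of a circular doubly linked free list with a sentinel head, and, per the fd_quic_stream changes earlier in this patch, a stream whose next/prev point at itself is by convention not on any list. A standalone sketch of that list discipline (plain C; node_t and the helpers are local stand-ins for fd_quic_stream_t and the FD_QUIC_STREAM_LIST_* macros):

#include <stdio.h>

/* Sketch: sentinel-based circular doubly linked free list where a
   self-linked node means "not a member of any list". */
typedef struct node node_t;
struct node { node_t * prev; node_t * next; int id; };

static void list_init  ( node_t * sentinel )           { sentinel->prev = sentinel->next = sentinel; }
static void list_insert( node_t * before, node_t * n ) { n->prev = before->prev; n->next = before; before->prev->next = n; before->prev = n; }
static void list_remove( node_t * n )                  { n->prev->next = n->next; n->next->prev = n->prev; n->prev = n->next = n; }
static int  list_empty ( node_t const * sentinel )     { return sentinel->next == sentinel; }

int
main( void ) {
  node_t head; list_init( &head );
  node_t pool[4];
  for( int i = 0; i < 4; i++ ) { pool[i].id = i; pool[i].prev = pool[i].next = &pool[i]; list_insert( &head, &pool[i] ); }

  node_t * s = head.next;  /* "alloc": pop the first free node */
  list_remove( s );
  printf( "allocated stream %d, self-linked=%d, pool empty=%d\n", s->id, s->next == s, list_empty( &head ) );

  list_insert( &head, s ); /* "free": push it back before the sentinel */
  printf( "freed stream %d, pool empty=%d\n", s->id, list_empty( &head ) );
  return 0;
}

fd_quic_stream_pool_alloc above is the same pop-from-head-and-self-link pattern, and fd_quic_stream_pool_free is the insert-before-the-sentinel push.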
fd_quic_stream_t head[1]; /* the head of the linked list of free streams, or NULL if none */ +}; + +typedef struct fd_quic_stream_pool fd_quic_stream_pool_t; + +FD_PROTOTYPES_BEGIN + +/* returns the alignment of the fd_quic_stream_pool_t */ +FD_FN_CONST inline +ulong +fd_quic_stream_pool_align( void ) { + return FD_QUIC_STREAM_POOL_ALIGN; +} + +/* returns the required footprint of fd_quic_stream_pool_t + + args + count the number of streams the pool will manage */ +FD_FN_CONST +ulong +fd_quic_stream_pool_footprint( ulong count, ulong tx_buf_sz ); + +/* returns a newly initialized stream pool + + args + mem the memory aligned to fd_quic_stream_pool_align, and at least fd_quic_stream_pool_footprint + bytes + count the number of streams the pool will manage + type the stream type used for the streams managed by this pool */ +fd_quic_stream_pool_t * +fd_quic_stream_pool_new( void * mem, ulong count, ulong tx_buf_sz ); + +/* delete a stream pool + + this will also delete all the associated streams + + All streams should be freed back to the pool before this function is called + + args + stream_pool the stream pool to free */ +void +fd_quic_stream_pool_delete( fd_quic_stream_pool_t * stream_pool ); + +/* allocates a stream from the pool + + args + stream_pool the pool from which to obtain the stream + + returns + the newly allocated stream, or NULL if no streams are available */ +fd_quic_stream_t * +fd_quic_stream_pool_alloc( fd_quic_stream_pool_t * pool ); + +/* free a stream to the specified pool + + args + stream_pool the pool to return the stream to + stream the stream to return */ +void +fd_quic_stream_pool_free( fd_quic_stream_pool_t * pool, fd_quic_stream_t * stream ); + +FD_PROTOTYPES_END + +#endif diff --git a/src/tango/quic/tests/Local.mk b/src/tango/quic/tests/Local.mk index 5cf968ccfa..9cfaa6e3b4 100644 --- a/src/tango/quic/tests/Local.mk +++ b/src/tango/quic/tests/Local.mk @@ -6,6 +6,7 @@ $(call make-unit-test,test_quic_hs,test_quic_hs,fd_aio fd_quic fd_ballet fd_tang $(call make-unit-test,test_quic_streams,test_quic_streams,fd_aio fd_ballet fd_tango fd_quic fd_util) $(call make-unit-test,test_quic_conn,test_quic_conn,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call make-unit-test,test_quic_server,test_quic_server,fd_aio fd_ballet fd_quic fd_tango fd_util) +$(call make-unit-test,test_quic_qos_server,test_quic_qos_server,fd_aio fd_ballet fd_quic fd_tango fd_util) $(call make-unit-test,test_quic_client_flood,test_quic_client_flood,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call make-unit-test,test_quic_bw,test_quic_bw,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call make-unit-test,test_quic_handshake,test_handshake,fd_aio fd_ballet fd_quic fd_util) @@ -19,6 +20,7 @@ $(call make-unit-test,test_quic_tls_both,test_tls_quic_both,fd_quic fd_ballet fd # $(call make-unit-test,test_quic_flow_control,test_quic_flow_control,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call make-unit-test,test_quic_retry_unit,test_quic_retry_unit,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call make-unit-test,test_quic_retry_integration,test_quic_retry_integration,fd_aio fd_quic fd_ballet fd_tango fd_util) +$(call make-unit-test,test_quic_qos_unit,test_quic_qos_unit,fd_aio fd_quic fd_ballet fd_tango fd_util) $(call run-unit-test,test_quic_hs) $(call run-unit-test,test_quic_streams) diff --git a/src/tango/quic/tests/test_quic_qos_server.c b/src/tango/quic/tests/test_quic_qos_server.c new file mode 100644 index 0000000000..6c8352d09e --- /dev/null +++ b/src/tango/quic/tests/test_quic_qos_server.c @@ -0,0 
+1,216 @@ +#include + +#include + +#include "fd_quic_test_helpers.h" +#include "../fd_quic.h" +#include "../fd_quic_qos.h" +#include "../tls/fd_quic_tls.h" + +#include "../../../util/fd_util_base.h" +#include "../../../util/net/fd_eth.h" +#include "../../../util/net/fd_ip4.h" + +#include "../../xdp/fd_xdp_redirect_user.h" +#include "../../xdp/fd_xsk.h" +#include "../../xdp/fd_xsk_aio.h" + +#include "../../../ballet/ed25519/fd_ed25519_openssl.h" +#include "../../../ballet/x509/fd_x509.h" + +#define FD_DEBUG_MODE 1 + +#define STAKE_LG_SLOT_CNT 1 + +static FD_TLS ulong conn_seq = 0UL; + +struct test_quic_qos_ctx { + fd_stake_t * stake; + fd_quic_qos_t * quic_qos; + fd_rng_t * rng; +}; +typedef struct test_quic_qos_ctx test_quic_qos_ctx_t; + +void +test_quic_qos_conn_new( fd_quic_conn_t * conn, void * _ctx ) { + conn->local_conn_id = ++conn_seq; + test_quic_qos_ctx_t * ctx = (test_quic_qos_ctx_t *)_ctx; + + fd_stake_pubkey_t pubkey = { 0 }; + fd_stake_pubkey_t * pubkey_ptr = &pubkey; + int verify_result = fd_quic_tls_get_pubkey( conn->tls_hs, pubkey.pubkey, FD_STAKE_PUBKEY_SZ ); + if ( FD_UNLIKELY( verify_result != X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT ) ) { + FD_DEBUG( FD_LOG_WARNING( ( "Failed to get conn: %lu's pubkey", conn->local_conn_id ) ) ); + pubkey_ptr = NULL; + } + fd_quic_qos_conn_new( ctx->quic_qos, ctx->stake, ctx->rng, conn, pubkey_ptr ); +} + +void +test_quic_qos_conn_final( fd_quic_conn_t * conn, void * _ctx ) { + FD_LOG_NOTICE(("releasing")); + test_quic_qos_ctx_t * ctx = (test_quic_qos_ctx_t *)_ctx; + fd_quic_qos_pq_t * pq = ctx->quic_qos->pq; + fd_quic_qos_pq_t * query = fd_quic_qos_pq_query( pq, conn->local_conn_id, NULL ); + if ( FD_UNLIKELY( ( query ) ) ) { /* most connections likely unstaked */ + fd_quic_qos_pq_remove( pq, query ); + } +} + +void +test_quic_qos_stream_receive( fd_quic_stream_t * stream, + void * ctx, + uchar const * data, + ulong data_sz, + ulong offset, + int fin ) { + (void)ctx; + + FD_LOG_NOTICE( ( "server rx stream data stream=%lu size=%lu offset=%lu fin=%d", + stream->stream_id, + data_sz, + offset, + fin ) ); + FD_LOG_HEXDUMP_NOTICE( ( "received data", data, data_sz ) ); +} + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); + if ( cpu_idx >= fd_shmem_cpu_cnt() ) cpu_idx = 0UL; + + char const * _page_sz = fd_env_strip_cmdline_cstr( &argc, &argv, "--page-sz", NULL, "gigantic" ); + ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); + ulong numa_idx = + fd_env_strip_cmdline_ulong( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx( cpu_idx ) ); + + ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); + if ( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR( ( "unsupported --page-sz" ) ); + + fd_quic_limits_t quic_limits = { 0 }; + fd_quic_limits_from_env( &argc, &argv, &quic_limits ); + + FD_LOG_NOTICE( ( "Creating workspace with --page-cnt %lu --page-sz %s pages on --numa-idx %lu", + page_cnt, + _page_sz, + numa_idx ) ); + fd_wksp_t * wksp = + fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); + FD_TEST( wksp ); + + /* initialize stakes*/ + ulong stake_footprint = fd_stake_footprint( STAKE_LG_SLOT_CNT ); + uchar * stake_mem = (uchar *)fd_wksp_alloc_laddr( wksp, fd_stake_align(), stake_footprint, 1UL ); + FD_TEST( stake_mem ); + fd_stake_t * stake = fd_stake_join( fd_stake_new( stake_mem, STAKE_LG_SLOT_CNT ) ); + fd_stake_node_t * staked_nodes = fd_stake_nodes_laddr( stake ); + FD_TEST( stake ); + FD_TEST( staked_nodes ); 
+ FD_LOG_NOTICE( ( "stake: %p, footprint: %lu", (void *)stake, stake_footprint ) ); + FD_LOG_NOTICE( ( " ->staked_nodes: %p", (void *)staked_nodes ) ); + + fd_stake_pubkey_t pubkey = { + .pubkey = {0x55, 0xc8, 0x0e, 0xa6, 0x55, 0xe2, 0xc2, 0x7a, 0xec, 0xef, 0xb0, + 0x4e, 0x2b, 0x86, 0xcb, 0x9e, 0x73, 0x0d, 0x09, 0x49, 0x75, 0xc8, + 0xc9, 0xa6, 0x81, 0xf1, 0x54, 0x6c, 0x7c, 0x40, 0x11, 0x7d} + }; + fd_stake_node_t * insert = fd_stake_node_insert( staked_nodes, pubkey ); + insert->stake = 42UL; + fd_stake_node_t * query = fd_stake_node_query( staked_nodes, pubkey, NULL ); + FD_TEST( query ); + FD_TEST( !memcmp( query->key.pubkey, query->key.pubkey, FD_TXN_PUBKEY_SZ ) ); + FD_TEST( insert->stake == query->stake ); + + /* initialize QoS */ + fd_quic_qos_limits_t qos_limits = { + .min_streams = FD_QUIC_QOS_DEFAULT_MIN_STREAMS, + .max_streams = FD_QUIC_QOS_DEFAULT_MAX_STREAMS, + .total_streams = FD_QUIC_QOS_DEFAULT_TOTAL_STREAMS, + // .pq_lg_slot_cnt = fd_ulong_find_msb(quic_limits.conn_cnt >> 1), + .pq_lg_slot_cnt = 1, + // .lru_depth = quic_limits.conn_cnt >> 1, + .lru_depth = 1, + }; + ulong qos_footprint = fd_quic_qos_footprint( &qos_limits ); + uchar * qos_mem = (uchar *)fd_wksp_alloc_laddr( wksp, fd_quic_qos_align(), qos_footprint, 1UL ); + FD_TEST( qos_mem ); + fd_quic_qos_t * qos = fd_quic_qos_join( fd_quic_qos_new( qos_mem, &qos_limits ) ); + FD_TEST( qos ); + FD_TEST( qos->pq ); + FD_TEST( qos->lru ); + FD_LOG_NOTICE( ( "qos: %p, footprint %lu", (void *)qos_mem, qos_footprint ) ); + FD_LOG_NOTICE( ( " ->pq: %p", (void *)qos->pq ) ); + FD_LOG_NOTICE( ( " ->lru: %p", (void *)qos->lru ) ); + + fd_quic_t * quic = fd_quic_new_anonymous( wksp, &quic_limits, FD_QUIC_ROLE_SERVER ); + FD_TEST( quic ); + FD_LOG_NOTICE( ( "quic %p, footprint: %lu", (void *)quic, fd_quic_footprint( &quic_limits ) ) ); + + fd_quic_udpsock_t _udpsock[1]; + fd_quic_udpsock_t * udpsock = + fd_quic_udpsock_create( _udpsock, &argc, &argv, wksp, fd_quic_get_aio_net_rx( quic ) ); + FD_TEST( udpsock ); + + fd_quic_config_t * quic_config = &quic->config; + FD_TEST( quic_config ); + + quic_config->role = FD_QUIC_ROLE_SERVER; + quic_config->retry = 1; + quic_config->verify_peer = 1; + quic_config->verify_depth = 0; + quic_config->verify_strict = 0; + FD_TEST( fd_quic_config_from_env( &argc, &argv, quic_config ) ); + + memcpy( quic_config->link.src_mac_addr, udpsock->self_mac, 6UL ); + quic_config->net.ip_addr = udpsock->listen_ip; + quic_config->net.listen_udp_port = udpsock->listen_port; + fd_quic_set_aio_net_tx( quic, udpsock->aio ); + uchar server_pkey[32] = { + 137, 115, 254, 55, 116, 55, 118, 19, 151, 66, 229, 24, 188, 62, 99, 209, + 162, 16, 6, 7, 24, 81, 152, 128, 139, 234, 170, 93, 88, 204, 245, 205, + }; + uchar server_pubkey[32] = { 44, 174, 25, 39, 43, 255, 200, 81, 55, 73, 10, + 113, 174, 91, 223, 80, 50, 51, 102, 25, 63, 110, + 36, 28, 51, 11, 174, 179, 110, 8, 25, 152 }; + FD_LOG_HEXDUMP_NOTICE( + ( "server: private key", server_pkey, 32 ) ); /* TODO use base-58 format specifier */ + FD_LOG_HEXDUMP_NOTICE( + ( "server: public key", server_pubkey, 32 ) ); /* TODO use base-58 format specifier */ + quic->cert_key_object = fd_ed25519_pkey_from_private( server_pkey ); + quic->cert_object = fd_x509_gen_solana_cert( quic->cert_key_object ); + + FILE * cert_file = fopen( "cert.pem", "wb" ); + PEM_write_X509( cert_file, quic->cert_object ); + fclose( cert_file ); + + if ( FD_UNLIKELY( argc > 1 ) ) FD_LOG_ERR( ( "unrecognized argument: %s", argv[1] ) ); + + FD_LOG_NOTICE( ( "Initializing QUIC" ) ); + FD_TEST( 
fd_quic_init( quic ) ); + + fd_rng_t _rng[1]; + fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0U, 0UL ) ); + + test_quic_qos_ctx_t ctx = { .stake = stake, .quic_qos = qos, .rng = rng }; + quic->cb.quic_ctx = &ctx; + quic->cb.conn_new = test_quic_qos_conn_new; + quic->cb.stream_receive = test_quic_qos_stream_receive; + quic->cb.conn_final = test_quic_qos_conn_final; + + while ( 1 ) { + fd_quic_service( quic ); + fd_quic_udpsock_service( udpsock ); + } + + FD_TEST( fd_quic_fini( quic ) ); + + fd_wksp_free_laddr( fd_quic_delete( fd_quic_leave( quic ) ) ); + fd_quic_udpsock_destroy( udpsock ); + fd_wksp_delete_anonymous( wksp ); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} diff --git a/src/tango/quic/tests/test_quic_qos_unit.c b/src/tango/quic/tests/test_quic_qos_unit.c new file mode 100644 index 0000000000..76d386b7c1 --- /dev/null +++ b/src/tango/quic/tests/test_quic_qos_unit.c @@ -0,0 +1,136 @@ +#include "../../../util/fd_util.h" +#include "../../../util/sanitize/fd_asan.h" +#include "../../stake/fd_stake.h" +#include "../../tcache/fd_tcache.h" +#include "../fd_quic_qos.h" +#include "../tls/fd_quic_tls.h" +#include "fd_quic_test_helpers.h" + +#define FD_DEBUG_MODE 1 + +#define PQ_LG_SLOT_CNT 6UL +#define PQ_SLOT_CNT ( 1UL << PQ_LG_SLOT_CNT ) +#define LRU_DEPTH ( 1UL << PQ_LG_SLOT_CNT ) +#define CNT_LG_SLOT_CNT ( PQ_LG_SLOT_CNT + 2UL ) +#define STAKE_LG_SLOT_CNT CNT_LG_SLOT_CNT +#define PUBKEY_CNT ( 1UL << ( PQ_LG_SLOT_CNT + 1UL ) ) +#define CONN_CNT ( 46 ) + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + if ( FD_UNLIKELY( argc > 1 ) ) FD_LOG_ERR( ( "unrecognized argument: %s", argv[1] ) ); + + fd_wksp_t * wksp = fd_wksp_new_anonymous( + FD_SHMEM_HUGE_PAGE_SZ, 1, fd_shmem_cpu_idx( fd_shmem_numa_idx( 0 ) ), "wksp", 0UL ); + FD_TEST( wksp ); + + fd_quic_limits_t const quic_limits = { + .conn_cnt = 2, + .conn_id_cnt = 4, + .conn_id_sparsity = 4.0, + .handshake_cnt = 10, + .stream_cnt = {0, 0, 2, 0}, + .inflight_pkt_cnt = 100, + .tx_buf_sz = 1 << 16 + }; + ulong quic_footprint = fd_quic_footprint( &quic_limits ); + FD_TEST( quic_footprint ); + fd_quic_t * quic = fd_quic_new_anonymous( wksp, &quic_limits, FD_QUIC_ROLE_SERVER ); + FD_TEST( quic ); + FD_LOG_NOTICE( ( "quic %p, footprint: %lu", (void *)quic, quic_footprint ) ); + + ulong stake_footprint = fd_stake_footprint( STAKE_LG_SLOT_CNT ); + uchar * stake_mem = (uchar *)fd_wksp_alloc_laddr( wksp, fd_stake_align(), stake_footprint, 1UL ); + FD_TEST( stake_mem ); + fd_stake_t * stake = fd_stake_join( fd_stake_new( stake_mem, STAKE_LG_SLOT_CNT ) ); + fd_stake_node_t * staked_nodes = fd_stake_nodes_laddr( stake ); + FD_TEST( stake ); + FD_TEST( staked_nodes ); + FD_LOG_NOTICE( ( "stake: %p, footprint: %lu", (void *)stake, stake_footprint ) ); + FD_LOG_NOTICE( ( " ->staked_nodes: %p", (void *)staked_nodes ) ); + + fd_quic_qos_limits_t limits = { + .min_streams = FD_QUIC_QOS_DEFAULT_MIN_STREAMS, + .max_streams = FD_QUIC_QOS_DEFAULT_MAX_STREAMS, + .total_streams = FD_QUIC_QOS_DEFAULT_TOTAL_STREAMS, + .pq_lg_slot_cnt = PQ_LG_SLOT_CNT, + .lru_depth = LRU_DEPTH, + .cnt_lg_slot_cnt = CNT_LG_SLOT_CNT, + .cnt_max_conns = 42, + }; + ulong qos_footprint = fd_quic_qos_footprint( &limits ); + uchar * qos_mem = (uchar *)fd_wksp_alloc_laddr( wksp, fd_quic_qos_align(), qos_footprint, 1UL ); + FD_TEST( qos_mem ); + fd_quic_qos_t * qos = fd_quic_qos_join( fd_quic_qos_new( qos_mem, &limits ) ); + FD_TEST( qos ); + FD_TEST( qos->pq ); + FD_TEST( qos->lru ); + FD_TEST( qos->cnt ); + FD_LOG_NOTICE( ( "qos: %p, footprint %lu", 
(void *)qos_mem, qos_footprint ) ); + FD_LOG_NOTICE( ( " ->pq: %p", (void *)qos->pq ) ); + FD_LOG_NOTICE( ( " ->lru: %p", (void *)qos->lru ) ); + FD_LOG_NOTICE( ( " ->cnt: %p", (void *)qos->cnt ) ); + + /* initialize stakes*/ + ulong stakes[PUBKEY_CNT] = { [PUBKEY_CNT >> 2] = 1UL << 15, 1UL << 14, 1UL << 13, 1UL << 13 }; + fd_stake_pubkey_t pubkeys[PUBKEY_CNT]; + for ( ulong i = 0; i < PUBKEY_CNT; i++ ) { + fd_stake_pubkey_t pubkey = { .pubkey = { (uchar)( i + 1 ) } }; + pubkeys[i] = pubkey; + } + for ( ulong i = 0; i < PUBKEY_CNT; i++ ) { + fd_stake_node_t * staked_node = fd_stake_node_insert( staked_nodes, pubkeys[i] ); + FD_TEST( staked_node ); + staked_node->key = pubkeys[i]; + staked_node->stake = stakes[i]; + stake->total_stake += stakes[i]; + } + FD_TEST( stake->total_stake == 1UL << 16 ); + + for ( ulong i = 0; i < PUBKEY_CNT; i++ ) { + fd_stake_node_t * staked_node = fd_stake_node_query( staked_nodes, pubkeys[i], NULL ); + FD_TEST( staked_node ); + FD_TEST( !memcmp( staked_node->key.pubkey, pubkeys[i].pubkey, FD_TXN_PUBKEY_SZ ) ); + FD_TEST( staked_node->stake == stakes[i] ); + } + + /* initialize mock conns */ + fd_quic_conn_t conns[CONN_CNT]; + for ( ulong i = 0; i < CONN_CNT; i++ ) { + memset( &conns[i], 0, sizeof( fd_quic_conn_t ) ); + conns[i].local_conn_id = i + 1; + conns[i].server = 1; + conns[i].quic = quic; + conns[i].cur_peer_idx = 0; + conns[i].peer[0].net.ip_addr = 0x0100007f; + } + + ulong x = 1; + FD_TEST( !--x ); + + fd_rng_t _rng[1]; + fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0U, 0UL ) ); + (void)rng; + + for ( ulong i = 0; i < CONN_CNT; i++ ) { + FD_TEST( &conns[i] ); + fd_quic_qos_conn_new( qos, stake, rng, &conns[i], &pubkeys[i] ); + } + + fd_quic_qos_cnt_key_t key; + key.ip4_addr = 0x0100007f; + fd_quic_qos_cnt_t * query = fd_quic_qos_cnt_query( qos->cnt, key, NULL ); + for ( ulong i = 0; i < fd_quic_qos_cnt_slot_cnt( qos->cnt ); i++ ) { + if ( !fd_quic_qos_cnt_key_inval( qos->cnt[i].key ) ) { + FD_LOG_NOTICE( ( "%u: %lu", qos->cnt[i].key.ip4_addr, qos->cnt[i].count ) ); + } + } + FD_TEST( query ); + FD_TEST( query->count == 42 ); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} diff --git a/src/tango/quic/tests/test_quic_streams.c b/src/tango/quic/tests/test_quic_streams.c index 1e1e2ed0a8..05b188d2c9 100644 --- a/src/tango/quic/tests/test_quic_streams.c +++ b/src/tango/quic/tests/test_quic_streams.c @@ -103,7 +103,8 @@ main( int argc, .handshake_cnt = 10, .stream_cnt = { 0, 0, 20, 0 }, .inflight_pkt_cnt = 100, - .tx_buf_sz = 1<<15 + .tx_buf_sz = 1<<15, + .stream_pool_sz = 128 }; fd_quic_t * server_quic = fd_quic_new_anonymous( wksp, &quic_server_limits, FD_QUIC_ROLE_SERVER ); FD_TEST( server_quic ); diff --git a/src/tango/quic/tests/test_tls_quic_both.c b/src/tango/quic/tests/test_tls_quic_both.c index b7582682cd..2bf4b6cc77 100644 --- a/src/tango/quic/tests/test_tls_quic_both.c +++ b/src/tango/quic/tests/test_tls_quic_both.c @@ -252,6 +252,7 @@ fd_quic_create_context( int is_server, // set callback for client hello SSL_CTX_set_client_hello_cb(ctx, fd_quic_ssl_client_hello, NULL); + } return ctx; diff --git a/src/tango/quic/tls/fd_quic_tls.c b/src/tango/quic/tls/fd_quic_tls.c index fa782ae046..41ce6d7c0e 100644 --- a/src/tango/quic/tls/fd_quic_tls.c +++ b/src/tango/quic/tls/fd_quic_tls.c @@ -1,13 +1,17 @@ #include "fd_quic_tls.h" #include "../fd_quic_private.h" #include "../../../util/fd_util.h" +#include "fd_quic_tls_enum.h" +#include #include #include #include #include #include +#define FD_DEBUG_MODE 1 + /* internal callbacks */ int 
fd_quic_ssl_add_handshake_data( SSL * ssl, @@ -160,6 +164,11 @@ fd_quic_tls_new( void * mem, self->alpns = cfg->alpns; self->alpns_sz = cfg->alpns_sz; + self->verify_peer = cfg->verify_peer; + self->verify_depth = cfg->verify_depth; + self->verify_strict = cfg->verify_strict; + self->verify_self_signed = cfg->verify_self_signed; + return self; } @@ -403,6 +412,7 @@ fd_quic_tls_process( fd_quic_tls_hs_t * self ) { } } else { // handle post-handshake messages + /* A server MUST NOT use post-handshake client authentication */ switch( SSL_process_quic_post_handshake( self->ssl ) ) { case 0: // failed { @@ -748,8 +758,19 @@ fd_quic_create_context( fd_quic_tls_t * quic_tls, } EVP_PKEY_free( pkey ); - /* set verification */ - //SSL_CTX_set_verify( ctx, SSL_VERIFY_PEER, NULL ); + /* Set verify. + + For a client, verifies the server cert. + For a server, sends a client cert request and verifies. + + See: https://www.openssl.org/docs/man3.0/man3/SSL_CTX_set_verify.html */ + // SSL_CTX_set_verify( ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, NULL ); + SSL_CTX_set_verify( ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, fd_quic_tls_always_continue_verify_cb ); + // if (quic_tls->verify_peer) { + // if (quic_tls->verify_strict) SSL_CTX_set_verify( ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, NULL ); + // else SSL_CTX_set_verify( ctx, SSL_VERIFY_PEER | SSL_VERIFY_CLIENT_ONCE, fd_quic_tls_always_continue_verify_cb ); + // SSL_CTX_set_verify_depth( ctx, quic_tls->verify_depth + 1 ); /* OpenSSL default is 100 */ + // } /* solana actual: "solana-tpu" */ ERR_clear_error(); @@ -852,3 +873,43 @@ fd_quic_tls_get_peer_transport_params( fd_quic_tls_hs_t * self, ulong * transport_params_sz ) { SSL_get_peer_quic_transport_params( self->ssl, transport_params, transport_params_sz ); } + +int +fd_quic_tls_get_pubkey( fd_quic_tls_hs_t * self, uchar * pubkey, ulong pubkey_sz ) { + X509 * client_cert = SSL_get_peer_certificate( self->ssl ); + if ( FD_LIKELY( client_cert ) ) { /* optimize for peers that present cert */ + EVP_PKEY * pubkey_ = X509_get_pubkey( client_cert ); + if ( FD_LIKELY( pubkey_ && EVP_PKEY_get_raw_public_key( pubkey_, pubkey, &pubkey_sz ) ) ) { + EVP_PKEY_free( pubkey_ ); + } + FD_DEBUG( + else { FD_LOG_WARNING( ( "Failed to get public key %s", fd_quic_tls_strerror() ) ); } ); + } + FD_DEBUG( + else { FD_LOG_WARNING( ( "Failed to get peer certificate %s", fd_quic_tls_strerror() ) ); } ); + return (int)SSL_get_verify_result( self->ssl ); +} + +int +fd_quic_tls_always_continue_verify_cb( int preverify_ok, + X509_STORE_CTX * ctx ) { + int err = X509_STORE_CTX_get_error( ctx ); + int depth = X509_STORE_CTX_get_error_depth( ctx ); + + /* clang-format off */ + if ( FD_UNLIKELY( !preverify_ok ) ) { + FD_DEBUG( FD_LOG_WARNING( ( "client verification failed: num=%d reason=%s depth=%d. continuing anyways.", + err, + X509_verify_cert_error_string( err ), + depth ) ) ); + } + + if ( depth > 0 ) { + FD_DEBUG( FD_LOG_WARNING( + ( "client certificate verify depth: %d too long. 
certificates should be self-signed", + depth ) ) ); + X509_STORE_CTX_set_error( ctx, X509_V_ERR_CERT_CHAIN_TOO_LONG ); + } + /* clang-format on */ + return 1; /* always continue even if client verify fails; just treat the connection as 0 stake */ +} diff --git a/src/tango/quic/tls/fd_quic_tls.h b/src/tango/quic/tls/fd_quic_tls.h index 22f2b7ffd7..d9f3a157e9 100644 --- a/src/tango/quic/tls/fd_quic_tls.h +++ b/src/tango/quic/tls/fd_quic_tls.h @@ -94,6 +94,10 @@ typedef void (* fd_quic_tls_cb_handshake_complete_t)( fd_quic_tls_hs_t * hs, void * context ); +typedef void +(* fd_quic_tls_cb_verify_t)( int preverify_ok, + X509_STORE_CTX * ctx ); + typedef void (* fd_quic_tls_cb_keylog_t)( fd_quic_tls_hs_t * hs, char const * line ); @@ -112,6 +116,7 @@ struct fd_quic_tls_cfg { fd_quic_tls_cb_alert_t alert_cb; fd_quic_tls_cb_secret_t secret_cb; fd_quic_tls_cb_handshake_complete_t handshake_complete_cb; + fd_quic_tls_cb_verify_t verify_cb; fd_quic_tls_cb_keylog_t keylog_cb; ulong max_concur_handshakes; @@ -125,6 +130,12 @@ struct fd_quic_tls_cfg { uchar const * alpns; /* ALPNs */ uint alpns_sz; /* number of bytes... see ALPN spec */ + + /* see `fd_quic.h` or `fd_quic_tls_verify_cb` for docs on these verify params */ + int verify_peer; + int verify_depth; + int verify_strict; + int verify_self_signed; }; /* structure for organising handshake data */ @@ -150,6 +161,14 @@ struct fd_quic_tls { fd_quic_tls_cb_handshake_complete_t handshake_complete_cb; fd_quic_tls_cb_keylog_t keylog_cb; + int keylog_fd; /* Regular file descriptor for key logging. Owned by fd_quic. */ + + /* see `fd_quic.h` or `fd_quic_tls_verify_cb` for docs on these verify params */ + int verify_peer; + int verify_depth; + int verify_strict; + int verify_self_signed; + ulong max_concur_handshakes; /* array of (max_concur_handshakes) pre-allocated handshakes */ @@ -159,9 +178,6 @@ struct fd_quic_tls { /* ssl related */ SSL_CTX * ssl_ctx; - /* Regular file descriptor for key logging. - Owned by fd_quic. */ - int keylog_fd; /* ALPNs in OpenSSL length-prefixed list format */ uchar const * alpns; @@ -349,5 +365,33 @@ fd_quic_tls_get_peer_transport_params( fd_quic_tls_hs_t * self, uchar const ** transport_params, ulong * transport_params_sz ); -#endif /* HEADER_fd_src_tango_quic_tls_fd_quic_tls_h */ +/* fd_quic_tls_get_pubkey + + get the pubkey from an X509 certificate, checking it for validity, and + returns the OpenSSL SSL_get_verify_result return code. + the application can decide how to process the return code accordingly. + for example, X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT is treated as an error + by OpenSSL, but the application can choose to make it allowable. */ +int +fd_quic_tls_get_pubkey( fd_quic_tls_hs_t * self, + uchar * pubkey, + ulong pubkey_sz ); + +/* fd_quic_tls_verify_cb is called by OpenSSL when verifying the peer cert. + + By default, OpenSSL will fail the handshake of verify fails. This will + instead continue the handshake even if verify fails. + + This is useful, for example, for an application that wants to treat + unverified connections as unauthenticated). + + verify_peer sets SSL_VERIFY_PEER flag. if server, sends a client cert request. + verify_depth sets the maximum allowable depth of a cert chain when verifying. + verify_strict sets whether to fail the handshake if cert verification fails. 
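Because fd_quic_tls_always_continue_verify_cb never aborts the handshake, it is up to the application to decide afterwards what a given SSL_get_verify_result code means. A standalone sketch of one such policy (plain C using OpenSSL's x509_vfy.h constants; peer_is_identified is a hypothetical helper, and which codes to accept is an application choice, the qos server test above accepts the self-signed-leaf case):

#include <openssl/x509_vfy.h>
#include <stdio.h>

/* Sketch: treat a verified cert or a self-signed leaf as an identified peer
   (Solana TPU client certs are self-signed); anything else is handled as an
   anonymous, zero-stake connection rather than being rejected. */
static int
peer_is_identified( int verify_result ) {
  return verify_result == X509_V_OK ||
         verify_result == X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT;
}

int
main( void ) {
  printf( "self-signed leaf accepted: %d\n",
          peer_is_identified( X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT ) );
  printf( "expired cert accepted:     %d\n",
          peer_is_identified( X509_V_ERR_CERT_HAS_EXPIRED ) );
  return 0;
}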
+ */ +int +fd_quic_tls_always_continue_verify_cb( int preverify_ok, + X509_STORE_CTX * ctx ); + +#endif /* HEADER_fd_src_tango_quic_tls_fd_quic_tls_h */ diff --git a/src/tango/stake/Local.mk b/src/tango/stake/Local.mk new file mode 100644 index 0000000000..b79b1fd7ce --- /dev/null +++ b/src/tango/stake/Local.mk @@ -0,0 +1,4 @@ +$(call add-hdrs,fd_stake.h) +$(call add-objs,fd_stake,fd_tango) +$(call make-unit-test,test_stake,test_stake,fd_tango fd_util) +$(call run-unit-test,test_stake,) diff --git a/src/tango/stake/fd_stake.c b/src/tango/stake/fd_stake.c new file mode 100644 index 0000000000..5a882c70fd --- /dev/null +++ b/src/tango/stake/fd_stake.c @@ -0,0 +1,123 @@ +#include "../mvcc/fd_mvcc.h" +#include "fd_stake.h" +#include + +ulong +fd_stake_align( void ) { + return FD_STAKE_ALIGN; +} + +ulong +fd_stake_footprint( int lg_slot_cnt ) { + if ( lg_slot_cnt <= 0 ) { return 0UL; } + return fd_ulong_align_up( sizeof( fd_stake_t ) + fd_stake_node_footprint( lg_slot_cnt ), + fd_stake_align() ); +} + +void * +fd_stake_new( void * shmem, int lg_slot_cnt ) { + + if ( FD_UNLIKELY( !shmem ) ) { + FD_LOG_WARNING( ( "NULL shmem" ) ); + return NULL; + } + + if ( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_stake_align() ) ) ) { + FD_LOG_NOTICE(("unaligned")); + FD_LOG_WARNING( ( "misaligned shmem" ) ); + return NULL; + } + + ulong footprint = fd_stake_node_footprint( (int)lg_slot_cnt ); + if ( FD_UNLIKELY( !footprint ) ) { + FD_LOG_WARNING( ( "bad lg_slot_cnt (%d): must be >=0", lg_slot_cnt ) ); + return NULL; + } + + fd_memset( shmem, 0, footprint ); + + fd_stake_t * stake = (fd_stake_t *)shmem; + fd_mvcc_t mvcc = { .version = 0 }; + stake->mvcc = mvcc; + stake->total_stake = 0; + /* note the map join happens inside `new`, because the offset from the start of the stake region + * to map slot0 is stable across joins */ + fd_stake_node_new( (uchar *)stake + sizeof( fd_stake_t ), lg_slot_cnt ); + + FD_COMPILER_MFENCE(); + FD_VOLATILE( stake->magic ) = FD_STAKE_MAGIC; + FD_COMPILER_MFENCE(); + + return shmem; +} + +fd_stake_t * +fd_stake_join( void * shstake ) { + FD_TEST(shstake); + + if ( FD_UNLIKELY( !shstake ) ) { + FD_LOG_WARNING( ( "NULL shstake" ) ); + return NULL; + } + + if ( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shstake, fd_stake_align() ) ) ) { + FD_LOG_WARNING( ( "misaligned shmem" ) ); + return NULL; + } + + fd_stake_t * stake = (fd_stake_t *)shstake; + if ( FD_UNLIKELY( stake->magic != FD_STAKE_MAGIC ) ) { + FD_LOG_WARNING( ( "bad magic" ) ); + return NULL; + } + + uchar * shmap = (uchar *)shstake + sizeof( fd_stake_t ); + fd_stake_node_t * stake_node = fd_stake_node_join( shmap ); + stake->nodes_off = (ulong)stake_node - (ulong)stake; + + return stake; +} + +fd_stake_node_t * +fd_stake_nodes_laddr( fd_stake_t * stake ) { + return (fd_stake_node_t *)( (ulong)stake + stake->nodes_off ); +} + +void +fd_stake_deser( fd_stake_t * stake, uchar * data, ulong sz ) { + fd_mvcc_begin_write( &stake->mvcc ); + + fd_stake_node_t * staked_nodes = fd_stake_nodes_laddr( stake ); + fd_stake_node_clear( staked_nodes ); + ulong total_stake = 0; + for ( ulong off = 0; off < sz; off += 40 ) { + /* 32-byte aligned. dcache is 128-byte aligned. 128 % 32 = 0. */ + fd_stake_pubkey_t * pubkey = (fd_stake_pubkey_t *)( fd_type_pun( data + off ) ); + /* 8-byte aligned. 32 + 8 = 40. 40 % 8 = 0. 
*/ + ulong stake = + *(ulong *)( fd_type_pun( data + off + sizeof( fd_stake_pubkey_t ) ) ); + fd_stake_node_t * staked_node = fd_stake_node_insert( staked_nodes, *pubkey ); + if ( staked_node == NULL ) staked_node = fd_stake_node_query( staked_nodes, *pubkey, NULL ); + if ( staked_node == NULL ) { + FD_LOG_HEXDUMP_WARNING( ( "failed to insert pubkey", pubkey, sizeof( fd_stake_pubkey_t ) ) ); + continue; + } + staked_node->stake = stake; + total_stake += stake; + } + printf("writing total stake %lu\n", stake->total_stake); + stake->total_stake = total_stake; + + fd_mvcc_end_write( &stake->mvcc ); +} + +void +fd_stake_dump( fd_stake_t * stake ) { + fd_stake_node_t * staked_nodes = fd_stake_nodes_laddr( stake ); + for ( ulong i = 0; i < fd_stake_node_slot_cnt( staked_nodes ); i++ ) { + fd_stake_node_t * staked_node = &staked_nodes[i]; + if ( !fd_stake_node_key_inval( staked_node->key ) ) { + FD_LOG_NOTICE( ( "stake[%lu] = %lu", i, staked_node->stake ) ); + } + } +} diff --git a/src/tango/stake/fd_stake.h b/src/tango/stake/fd_stake.h new file mode 100644 index 0000000000..f107e524e6 --- /dev/null +++ b/src/tango/stake/fd_stake.h @@ -0,0 +1,133 @@ +#ifndef HEADER_fd_src_tango_stake_fd_stake_h +#define HEADER_fd_src_tango_stake_fd_stake_h + +#include "../mvcc/fd_mvcc.h" + +/* double cache line */ +#define FD_STAKE_ALIGN 128UL + +/* maximum lg # of staked nodes we can track */ +#define FD_STAKE_LG_SLOT_CNT 16UL + +/* 32-bytes, as with all Solana pubkeys */ +#define FD_STAKE_PUBKEY_SZ 32UL + +/* opaque */ +#define FD_STAKE_MAGIC 0xF17EDA2CE757A1E0 /* FIREDANCER STAKE V0 */ + +struct fd_stake_private { + ulong magic; /* == FD_STAKE_MAGIC */ + fd_mvcc_t mvcc; + ulong total_stake; /* total amount of stake */ + ulong nodes_off; /* offset to map region */ +}; +typedef struct fd_stake_private fd_stake_t; + +struct fd_stake_pubkey { + uchar pubkey[FD_STAKE_PUBKEY_SZ]; +}; + +typedef struct fd_stake_pubkey fd_stake_pubkey_t; +static fd_stake_pubkey_t pubkey_null = { 0 }; + +/* Staked node map */ +struct fd_stake_node { + fd_stake_pubkey_t key; + uint hash; + ulong stake; +}; +typedef struct fd_stake_node fd_stake_node_t; + +#define MAP_NAME fd_stake_node +#define MAP_T fd_stake_node_t +#define MAP_KEY_T fd_stake_pubkey_t +#define MAP_KEY_NULL pubkey_null +#define MAP_KEY_INVAL( k ) !( memcmp( &k, &pubkey_null, sizeof( fd_stake_pubkey_t ) ) ) +#define MAP_KEY_EQUAL( k0, k1 ) !( memcmp( ( k0.pubkey ), ( k1.pubkey ), FD_STAKE_PUBKEY_SZ ) ) +#define MAP_KEY_EQUAL_IS_SLOW 1 +#define MAP_KEY_HASH( key ) ( (uint)( fd_hash( 0UL, key.pubkey, FD_STAKE_PUBKEY_SZ ) ) ) +#include "../../util/tmpl/fd_map_dynamic.c" + +ulong +fd_stake_align( void ); + +ulong +fd_stake_footprint( int lg_slot_cnt ); + +/* fd_stake_new formats an unused memory region for use as a stake object. `nodes_off` points to the + first slot, which is past the map header. The layout is diagrammed below: + + ------------------ <- (fd_stake_t * stake) // returned by fd_stake_new + private hdr region + ------------------ + nodes map region + + ..... hdr ..... + ..... node 0 ..... + ..... node 1 ..... + ..... ...... ..... + ..... node n ..... + + ------------------ */ +void * +fd_stake_new( void * mem, int lg_slot_cnt ); + +/* fd_stake_join joins the caller to the stake object. + + fd_stake_t is designed to be shared across multiple joins. Therefore, it maintains an offset for + the staked nodes region (which itself requires a join), which is located within the stake region + itself. 
It uses an offset in lieu of pointers, because the pointer addresses would otherwise be + local to each joined process. Note this is a pointer to the first slot in the map, rather than + the start of the map region itself, as `fd_map_dynamic` expects slot pointers in its API. + + ------------------- + private hdr region + ------------------ + nodes map region + + ..... hdr ..... + ..... node 0 ..... <- (fd_stake_t * stake) + nodes_off // set by fd_stake_join + ..... node 1 ..... + ..... ...... ..... + ..... node n ..... + + ------------------ */ +fd_stake_t * +fd_stake_join( void * shstake ); + +ulong +fd_stake_version( fd_stake_t * stake ); + +ulong * +fd_stake_version_laddr( fd_stake_t * stake ); + +fd_stake_node_t * +fd_stake_nodes_laddr( fd_stake_t * stake ); + +/* fd_stake_read performs an mvcc-fenced read of the stake structure. `fd_stake_t` is a single-producer, + * multiple-consumer concurrency structure and an odd version number indicates the writer is + * currently writing to the structure. */ +fd_stake_t * +fd_stake_read( fd_stake_t * stake); + +/* fd_stake_write performs an mvcc-fenced write of the stake structure. Assumes there is a single + writer and does not check for safe concurrency with multiple writers. + + `data` is a pointer to a bincode-serialized byte representation of stakes from the labs client. + + Serialization format: + ----------- + total stake (8 bytes, le) + node0 pubkey (32 bytes, le) + node0 stake (8 bytes, le) + node1 pubkey (32 bytes, le) + node1 stake (8 bytes, le) + ... + ----------- */ +void +fd_stake_deser( fd_stake_t * stake, uchar * data, ulong sz ); + +void +fd_stake_dump( fd_stake_t * stake ); + +#endif /* HEADER_fd_src_tango_stake_fd_stake_h */ diff --git a/src/tango/stake/test_stake.c b/src/tango/stake/test_stake.c new file mode 100644 index 0000000000..70c203a989 --- /dev/null +++ b/src/tango/stake/test_stake.c @@ -0,0 +1,52 @@ +#include "../../util/fd_util.h" +#include "fd_stake.h" + +#define LG_SLOT_CNT 10 +#define MAX_NODE_CNT ( 1UL << LG_SLOT_CNT ) +#define NUM_PUBKEYS 4 + +fd_stake_pubkey_t pubkeys[NUM_PUBKEYS] = { + { .pubkey = { 44, 174, 25, 39, 43, 255, 200, 81, 55, 73, 10, 113, 174, 91, 223, 80, + 50, 51, 102, 25, 63, 110, 36, 28, 51, 11, 174, 179, 110, 8, 25, 152 } }, + { .pubkey = { 250, 56, 248, 84, 190, 46, 154, 76, 15, 72, 181, 205, 32, 96, 128, 213, + 158, 33, 81, 193, 63, 154, 93, 254, 15, 81, 32, 175, 54, 60, 179, 224 } }, + { .pubkey = { 225, 102, 95, 246, 174, 91, 1, 240, 118, 174, 119, 113, 150, 146, 149, 29, + 253, 10, 69, 168, 188, 51, 31, 11, 67, 18, 201, 181, 189, 178, 159, 178 } }, + { .pubkey = { 160, 58, 145, 16, 41, 55, 193, 27, 132, 112, 36, 109, 233, 125, 206, + 165, 200, 130, 76, 147, 173, 151, 180, 73, 248, 4, 165, 8, 163, 42 } } }; + +void +test_stake( void ) { + fd_wksp_t * wksp = fd_wksp_new_anonymous( + FD_SHMEM_GIGANTIC_PAGE_SZ, 1, fd_shmem_cpu_idx( fd_shmem_numa_idx( 0 ) ), "wksp", 0UL ); + FD_TEST( wksp ); + void * mem = + fd_wksp_alloc_laddr( wksp, fd_stake_align(), fd_stake_footprint( LG_SLOT_CNT ), 42UL ); + + fd_stake_t * stake = fd_stake_join( fd_stake_new( mem, LG_SLOT_CNT ) ); + fd_stake_node_t * staked_nodes = fd_stake_nodes_laddr( stake ); + + for ( ulong i = 0; i < NUM_PUBKEYS; i++ ) { + fd_stake_node_t * staked_node = fd_stake_node_insert( staked_nodes, pubkeys[i] ); + staked_node->stake = i; + FD_TEST( staked_node ); + } + for ( ulong i = 0; i < NUM_PUBKEYS; i++ ) { + fd_stake_node_t * staked_node = fd_stake_node_query( staked_nodes, pubkeys[i], NULL ); + FD_TEST( staked_node ); + FD_TEST( 
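fd_stake_deser's loop above consumes back-to-back 40-byte records: a 32-byte pubkey immediately followed by an 8-byte little-endian stake. A standalone sketch of producing such a buffer (plain C; append_record and the sample keys are made up for illustration, and only the per-node records that the parsing loop walks are emitted, since the loop recomputes the total stake itself):

#include <stdio.h>
#include <string.h>

#define PUBKEY_SZ 32UL
#define RECORD_SZ ( PUBKEY_SZ + 8UL )

/* Sketch: append one (pubkey, stake) record at index idx of buf,
   writing the stake as 8 little-endian bytes. */
static void
append_record( unsigned char * buf, unsigned long idx,
               unsigned char const * pubkey, unsigned long stake ) {
  unsigned char * rec = buf + idx * RECORD_SZ;
  memcpy( rec, pubkey, PUBKEY_SZ );
  for( unsigned long b = 0UL; b < 8UL; b++ ) rec[PUBKEY_SZ + b] = (unsigned char)( stake >> ( 8UL * b ) );
}

int
main( void ) {
  unsigned char buf[2 * RECORD_SZ];
  unsigned char key0[PUBKEY_SZ] = { 1 };
  unsigned char key1[PUBKEY_SZ] = { 2 };
  append_record( buf, 0UL, key0, 42UL );
  append_record( buf, 1UL, key1, 1000000UL );
  printf( "serialized 2 nodes into %lu bytes\n", (unsigned long)sizeof( buf ) );
  /* a joined fd_stake_t could then be populated with fd_stake_deser( stake, buf, sizeof( buf ) ) */
  return 0;
}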
staked_node->stake == i ); + } +} + +int +main( int argc, char ** argv ) { + fd_boot( &argc, &argv ); + + if ( FD_UNLIKELY( argc > 1 ) ) FD_LOG_ERR( ( "unrecognized argument: %s", argv[1] ) ); + + test_stake(); + + FD_LOG_NOTICE( ( "pass" ) ); + fd_halt(); + return 0; +} diff --git a/src/tango/tcache/fd_tcache.h b/src/tango/tcache/fd_tcache.h index ae18ca9d5f..4ed7abb792 100644 --- a/src/tango/tcache/fd_tcache.h +++ b/src/tango/tcache/fd_tcache.h @@ -344,7 +344,7 @@ fd_tcache_remove( ulong * map, /* FD_TCACHE_INSERT inserts tag into the tcache in fast O(1) operations. On return, if dup is non-zero, tag is already in the tcache and the tcache in unchanged. If dup is zero, tag was inserted and, if the - tcache was full (e.g. has had depth values previously inserted), the + tcache was full (i.e. had already contained depth values), the oldest tag in the tcache will have been evicted. This is implemented as a macro to support multiple return values (dup diff --git a/src/util/bits/fd_bits.h b/src/util/bits/fd_bits.h index 9d90692ac1..9c399daae5 100644 --- a/src/util/bits/fd_bits.h +++ b/src/util/bits/fd_bits.h @@ -537,7 +537,7 @@ fd_double_eq( double x, #ifndef FD_UNALIGNED_ACCESS_STYLE #if FD_HAS_X86 -#define FD_UNALIGNED_ACCESS_STYLE 1 +#define FD_UNALIGNED_ACCESS_STYLE 0 #else #define FD_UNALIGNED_ACCESS_STYLE 0 #endif diff --git a/src/util/shmem/fd_shmem_private.h b/src/util/shmem/fd_shmem_private.h index 0a80894e1e..7ad9fcf1ef 100644 --- a/src/util/shmem/fd_shmem_private.h +++ b/src/util/shmem/fd_shmem_private.h @@ -2,6 +2,7 @@ #define HEADER_fd_src_util_shmem_fd_shmem_private_h #include "fd_shmem.h" +#include #if FD_HAS_THREADS #include @@ -136,8 +137,18 @@ static inline char * /* ==buf always */ fd_shmem_private_path( char const * name, /* Valid name */ ulong page_sz, /* Valid page size (normal, huge, gigantic) */ char * buf ) { /* Non-NULL with FD_SHMEM_PRIVATE_PATH_BUF_MAX bytes */ - return fd_cstr_printf( buf, FD_SHMEM_PRIVATE_PATH_BUF_MAX, NULL, "%s/.%s/%s", + + buf = fd_cstr_printf( buf, FD_SHMEM_PRIVATE_PATH_BUF_MAX, NULL, "%s/.%s/%s", fd_shmem_private_base, fd_shmem_page_sz_to_cstr( page_sz ), name ); + if ( getenv( "FD_FFI" ) ) { + char tmp[FD_SHMEM_PRIVATE_PATH_BUF_MAX]; + char prefix[9] = "/mnt/.fd"; + memcpy( tmp, prefix, 8 ); + memcpy( tmp + 8, buf, FD_SHMEM_PRIVATE_PATH_BUF_MAX - 8 ); + memset( buf, 0, FD_SHMEM_PRIVATE_PATH_BUF_MAX ); + memcpy( buf, tmp, FD_SHMEM_PRIVATE_PATH_BUF_MAX ); + } + return buf; } FD_PROTOTYPES_END