diff --git a/book/.vitepress/config.mts b/book/.vitepress/config.mts index 3978424b78..b771898adf 100644 --- a/book/.vitepress/config.mts +++ b/book/.vitepress/config.mts @@ -10,7 +10,7 @@ export default defineConfig({ lastUpdated: true, head: [ - ['link', { rel: 'icon', type: 'image/svg+xml', href: '/fire.svg' }], + ['link', { rel: 'icon', type: 'image/svg+xml', href: '/firedancer/fire.svg' }], ['meta', { name: 'theme-color', content: '#1ce7c2' }], ['meta', { name: 'og:type', content: 'website' }], ['meta', { name: 'og:locale', content: 'en' }], @@ -37,13 +37,19 @@ export default defineConfig({ { text: 'Initializing', link: 'initializing' }, ] }, + { + text: 'Performance', + collapsed: false, + items: [ + { text: 'Tuning', link: 'tuning' }, + ] + }, { text: 'Operating', collapsed: false, items: [ { text: 'Monitoring', link: 'monitoring' }, { text: 'Troubleshooting', link: 'troubleshooting' }, - { text: 'Tuning', link: 'tuning' }, { text: 'Frequently Asked Questions', link: 'faq' }, ] } diff --git a/book/guide/configuring.md b/book/guide/configuring.md index c3356dedd6..ec50dcdd13 100644 --- a/book/guide/configuring.md +++ b/book/guide/configuring.md @@ -4,7 +4,9 @@ Firedancer is configured via. a [TOML](https://toml.io/en/) file. Almost all options have a recommended default value that is set automatically by Firedancer, and an operator needs only to specify values for options -they wish to change. +they wish to change. The full list of options, as specified in the +[`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file, is documented below. ::: tip MIGRATING diff --git a/book/guide/monitoring.md b/book/guide/monitoring.md index 10a92f5123..a3706efd4d 100644 --- a/book/guide/monitoring.md +++ b/book/guide/monitoring.md @@ -74,18 +74,8 @@ tile_pid{kind="quic",kind_id="0"} 1108975 tile_pid{kind="verify",kind_id="0"} 1108978 ``` -All of the metrics have two two lables, a `kind` which tells you what -type of tile the metric is being reported for, and a `kind_id` which is -the index of the tile. For example, if there are two bank tiles for -executing transactions, they have `kind_id` of `0` and `1`, and each -report metrics separately. - -```sh [bash] -# HELP bank_tile_transaction_executed_program_account_not_found When a transaction executes (makes it onto the chain), result of executing a transaction. The transaction can still fail. (Attempt to load a program that does not exist.) -# TYPE bank_tile_transaction_executed_program_account_not_found counter -bank_tile_transaction_executed_program_account_not_found{kind="bank",kind_id="0"} 241 -bank_tile_transaction_executed_program_account_not_found{kind="bank",kind_id="1"} 13 -``` +See the [metrics API documentation](/api/metrics.html) for more +information on the available data. ## Live monitoring Firedancer ships with a monitoring tool included in `fdctl`, which you diff --git a/book/guide/tuning.md b/book/guide/tuning.md index 5b235907d9..12524e8778 100644 --- a/book/guide/tuning.md +++ b/book/guide/tuning.md @@ -1,81 +1,241 @@ -# Tuning +# Performance Tuning + +## Overview +The Firedancer validator is composed of a handful of threads, each +performing one of eleven distinct jobs. Some jobs only need one thread +to do them, but certain jobs require many threads performing the same +work in parallel. + +Each thread is given a CPU core to run on, and threads take ownership of +the core: never sleeping or letting the operating system use it for +another purpose.
The combination of a job, the thread it runs on, +and the CPU core it is assigned to is called a tile. The eleven kinds of +tile are, + +| Tile | Description | +|--------|-------------| +| `net` | Sends and receives network packets from the network device | +| `quic` | Receives transactions from clients, performing all connection management and packet processing to manage and implement the QUIC protocol | +| `verify` | Verifies the cryptographic signature of incoming transactions, filtering invalid ones | +| `dedup` | Checks for and filters out duplicated incoming transactions | +| `pack` | Collects incoming transactions and smartly schedules them for execution when we are leader | +| `bank` | Executes transactions that have been scheduled when we are leader | +| `poh` | Continuously hashes in the background, and mixes the hash in with executed transactions to prove passage of time | +| `shred` | Distributes block data to the network when leader, and receives and retransmits block data when not leader | +| `store` | Receives block data when we are leader, or from other nodes when they are leader, and stores it locally in a database on disk | +| `metric` | Collects monitoring information about other tiles and serves it on an HTTP endpoint | +| `sign` | Holds the validator private key, and receives and responds to signing requests from other tiles | + +These tiles communicate with each other via shared memory queues. The +work each tile performs and how they communicate with each other is +fixed, but the count of each tile kind and which CPU cores they are +assigned to is set by your configuration, and this is the primary way to +tune the performance of Firedancer. + +## Configuration +The default configuration provided if no options are specified is given +in the [`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file: + +::: code-group + +```toml [default.toml] +[layout] + affinity = "1-16" + solana_labs_affinity = "17-31" + net_tile_count = 1 + quic_tile_count = 1 + verify_tile_count = 4 + bank_tile_count = 2 + shred_tile_count = 2 ``` -## Tiles + +::: + +Note that not all tiles have a configurable count. The `dedup`, `pack`, +`poh`, `store`, `metric`, and `sign` tiles are fixed at one thread each. + +The assignment of tiles to CPU cores is determined by the `affinity` +string, which is documented fully in the +[`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file itself. The Frankendancer validator currently starts an Agave +process to perform functionality like replay, gossip, and repair that is +not yet implemented in Firedancer. The `solana_labs_affinity` string +determines the CPU cores that are given to the threads of this Agave +process. +
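+As a sketch of how these options fit together (the exact core ranges here are illustrative and depend on your machine), every tile needs its own CPU core, so raising a tile count also means widening the Firedancer affinity and moving the Agave affinity so the two still do not overlap. For example, going from 4 to 6 verify tiles adds two tiles, and therefore needs two more cores: + +```toml +[layout] + affinity = "1-18" + solana_labs_affinity = "19-31" + verify_tile_count = 6 +``` +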
+The following table shows the performance of the adjustable tiles on an +Intel Icelake core, along with some performance notes and +recommendations for `mainnet-beta`, + +| Tile | Default | Notes | +|----------|-----------------|-------| +| `net` | 1 | Handles >1M TPS per tile. Designed to scale out for future network conditions, but there is no need to run more than 1 net tile at the moment on `mainnet-beta` | +| `quic` | 1 | Handles >1M TPS per tile. Designed to scale out for future network conditions, but there is no need to run more than 1 QUIC tile at the moment on `mainnet-beta` | +| `verify` | 4 | Handles 20-40k TPS per tile. Recommend running many verify tiles, as signature verification is the primary bottleneck of the application | +| `bank` | 2 | Handles 20-40k TPS per tile, with diminishing returns from adding more tiles. Designed to scale out for future network conditions, but 2 tiles is enough to handle current `mainnet-beta` conditions. Can be increased further when benchmarking to test future network performance | +| `shred` | 2 | Throughput is mainly dependent on cluster size, 2 tiles is enough to handle current `mainnet-beta` conditions. In benchmarking, if the cluster size is small, 1 tile can handle >1M TPS | + +## Testing +Firedancer includes a simple benchmarking tool for measuring the +transaction throughput of the validator when it is leader, in +transactions per second (TPS). In practice, the Solana network +performance is limited by two factors that are unrelated to what +this tool measures: + + - The replay performance of the slowest nodes in the network, and if +they can keep up + - The consensus limits on block size and data size + +In particular, consensus limits on the Solana protocol limit the network +strictly to around 81,000 TPS. But the tool can be useful for testing +local affinity and layout configurations. + +The benchmark runs on a single machine and performs the following: + + 1. A new genesis is created, and a set of accounts is pre-funded + 2. A set of CPU cores is assigned to generating and signing simple +transactions using these accounts as fast as possible + 3. Another set of CPU cores is assigned to sending these transfers +via. QUIC over loopback to the locally running validator + 4. Around once a second, an RPC call is made to get the total count of +transactions that have executed on the chain, and this information is +printed to the console + +The benchmark is currently quite synthetic, as it only measures single +node performance, in an idealized case where all transactions are +non-conflicting. + +## Running +The benchmark command is part of the `fddev` development binary, which +can be built with `make -j fddev`. With the binary in hand, we can run +our benchmark; here it will be on a 32 physical core AMD EPYC 7513: + +```sh [bash] +$ lscpu +Architecture: x86_64 +CPU(s): 64 +On-line CPU(s) list: 0-63 +Thread(s) per core: 2 +Core(s) per socket: 32 +Socket(s): 1 +NUMA node(s): 1 +Vendor ID: AuthenticAMD +Model name: AMD EPYC 7513 32-Core Processor +``` -To stay caught up with the cluster, the replay stage needs enough -cores and processing power. If you see your validator falling -behind with the default configuration, consider trying out the -following: +<<< @/snippets/bench/bench1.ansi -### Increase Shred Tiles +We have not provided a configuration file to the bench command, so it +is using the stock configuration from `default.toml` and reaching around +63,000 TPS. -Example Original Config: +Let's take a look at the performance with the `monitor` command and see +if we can figure out what's going on. -```toml -[layout] - affinity = "1-18" - quic_tile_count = 2 - verify_tile_count = 4 - bank_tile_count = 4 - agave_affinity = "19-31" -``` +<<< @/snippets/bench/bench2.ansi -Example New Config: +If we narrow in on just the verify tiles we can see the problem: all of +the verify tiles are completely busy processing incoming transactions, +and so additional transactions are being dropped.
Here `% finish` +indicates the percentage of time the tile is occupied doing work, while +`ovrnp cnt` indicates that the tile is being overrun by the quic tile +and dropping transactions. -```toml -[layout] - affinity = "1-18" - quic_tile_count = 2 - verify_tile_count = 5 - bank_tile_count = 2 - shred_tile_count = 2 - agave_affinity = "19-31" -``` +<<< @/snippets/bench/bench3.ansi -This takes a core from the `bank` tile (transaction execution) and -gives it to another `shred` tile (turbine and shred processing). It -takes another core from another `bank` tile and gives it to a `verify` -(signature verification) tile. +This configuration is not ideal. With some tuning to increase the number +of verify tiles, and a few other changes, we can try to achieve a higher +TPS rate, -### Increase Cores for Agave +::: code-group -Example Original Config: +```toml [bench-zen3-32core.toml] +[ledger] + # Place the ledger in memory rather than on disk so that writing the + # ledger is not a performance bottleneck + path = "/data/shm/{name}/ledger" -```toml [layout] - affinity = "1-18" + # We will need a lot of verify tiles, and a few more bank tiles to be - quic_tile_count = 2 + # able to execute at higher TPS rates. Increase their core counts, and - verify_tile_count = 5 + # assign the tiles to cores. We only need 1 shred tile; since there is - bank_tile_count = 2 + # only 1 node in the cluster, it can handle a high TPS rate by itself - shred_tile_count = 2 + affinity = "14-57,f1" - agave_affinity = "19-31" + solana_labs_affinity = "58-63" + verify_tile_count = 30 + bank_tile_count = 6 + shred_tile_count = 1 + +[development.genesis] + # The default amount of accounts to use for the benchmark is 1024, but + # to reach higher transaction throughput we need more accounts so that + # more transfers can be handled in parallel + fund_initial_accounts = 32768 + +[development.bench] + # benchg tiles are used to generate and sign transactions in the + # benchmarking tool; we are going to need more of them to test higher + # TPS rates + benchg_tile_count = 12 + + # benchs tiles are for sending the transactions to Firedancer over + # loopback, and we will need an extra one of these as well + benchs_tile_count = 2 + + # Assign these benchg, benchs (and the bencho tile which orchestrates + # the benchmarking) to some CPU cores. The bencho assignment is + # floating as it is not performance sensitive + affinity = "f1,0-13" + + # The Solana protocol consensus limits restrict the benchmark to + # around 81,000 TPS. We have special options to increase these limits + # for testing and benchmarking + larger_max_cost_per_block = true + larger_shred_limits_per_block = true + +[rpc] + # Tracking certain transaction history and metadata to serve RPC + # requests is expensive and can slow down our validator, so turn this + # functionality off + transaction_history = false + extended_tx_metadata_storage = false ``` -Example New Config: +::: -```toml -[layout] - affinity = "1-16" - quic_tile_count = 1 - verify_tile_count = 4 - bank_tile_count = 2 - shred_tile_count = 2 - agave_affinity = "17-31" -``` +Now try running again, -This takes 1 core from the `quic` tile and another from the `verify` -tile gives them both to the agave threads (where the replay stage -runs). +<<< @/snippets/bench/bench4.ansi -## QUIC +We start out with a higher TPS rate but it quickly falls back to around +90k TPS. We can try to figure out why by running the `monitor` command. -There is a lot of QUIC traffic in the cluster.
If the validator is -having a hard time establishing QUIC connections, it might end up -getting less transactions. Some parameters that can be tuned to address -this are (these 2 parameters need to be the same value): +<<< @/snippets/bench/bench5.ansi -```toml -[tiles.quic] - max_concurrent_connections = 2048 - max_concurrent_handshakes = 2048 +The culprit is visible in the output, which will be clearer if we filter +it down to the relevant information, + +<<< @/snippets/bench/bench6.ansi + +Here we see what is happening. The blockstore is completely busy +spending 99.973% of its time storing data, while the PoH and shred tiles +are in back-pressure waiting for the the blockstore to catch up. The +blockstore is an Agave component built on RocksDB that is not rewritten +as part of Frankendancer. + +::: code-group + +```toml [bench-zen3-32core.toml] +[development.bench] + disable_blockstore = true // [!code ++] ``` + +::: + +We can disable the blockstore specifically for benchmarking, to show the +performance of just the Firedancer components in the leader pipeline. +Now we can run one more time and see a reasonably good value for the TPS +throughput of Firedancer on this machine. + +<<< @/snippets/bench/bench7.ansi diff --git a/book/snippets/bench/bench1.ansi b/book/snippets/bench/bench1.ansi new file mode 100644 index 0000000000..fa726f49d8 --- /dev/null +++ b/book/snippets/bench/bench1.ansi @@ -0,0 +1,8 @@ +$ ./build/native/gcc/bin/fddev bench +NOTICE  main configure.c(106): kill ... configuring +NOTICE  main configure.c(81): netns ... skipping .. not enabled +NOTICE  main configure.c(102): hugetlbfs ... already valid +NOTICE  main configure.c(102): sysctl ... already valid +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 63373 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 63340 txn/s diff --git a/book/snippets/bench/bench2.ansi b/book/snippets/bench/bench2.ansi new file mode 100644 index 0000000000..361aa74b8c --- /dev/null +++ b/book/snippets/bench/bench2.ansi @@ -0,0 +1,10 @@ +$ ./build/native/gcc/bin/fddev monitor +snapshot for 2024-07-29 16:44:59.066296419 GMT+00 + tile | pid | stale | heart | sig | in backp | backp cnt | % hkeep | % backp | % wait | % ovrnp | % ovrnr | % filt1 | % filt2 | % finish +---------+---------+------------+-------+------------+----------+---------------------+----------+----------+----------+----------+----------+----------+----------+---------- + net | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 35.511 | 0.000 | 54.233 | 0.000 | 0.000 | 0.000 | 10.255 | 0.000 + quic | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.405 | 0.000 | 95.501 | 0.000 | 0.000 | 0.000 | 0.000 | 4.094 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.081 | 0.000 | 0.047 | 0.000 | 0.000 | 0.084 | 0.000 | 99.788 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.082 | 0.000 | 0.036 | 0.000 | 0.000 | 0.078 | 0.000 | 99.804 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.123 | 0.000 | 0.000 | 0.000 | 0.000 | 0.111 | 0.000 | 99.765 +[ ... snip ... 
] diff --git a/book/snippets/bench/bench3.ansi b/book/snippets/bench/bench3.ansi new file mode 100644 index 0000000000..66e2ce90a2 --- /dev/null +++ b/book/snippets/bench/bench3.ansi @@ -0,0 +1,13 @@ + tile | sig | in backp | backp cnt | % backp | % finish +---------+------------+----------+---------------+----------+---------- + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.788 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.804 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.765 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.745 + + link | tot TPS | ovrnp cnt | ovrnr cnt | slow cnt | tx seq +------------------+----------+---------------------+---------------------+---------------------+------------------- + quic->verify | 17.2K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 17.2K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 11.8K | 15( +0) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 17.0K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) diff --git a/book/snippets/bench/bench4.ansi b/book/snippets/bench/bench4.ansi new file mode 100644 index 0000000000..38873049a0 --- /dev/null +++ b/book/snippets/bench/bench4.ansi @@ -0,0 +1,7 @@ +$ ./build/native/gcc/bin/fddev bench --config src/app/fdctl/config/bench-zen3-32core.toml +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 191180 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 308027 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 12389 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 89564 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 92380 txn/s diff --git a/book/snippets/bench/bench5.ansi b/book/snippets/bench/bench5.ansi new file mode 100644 index 0000000000..c30d0327bd --- /dev/null +++ b/book/snippets/bench/bench5.ansi @@ -0,0 +1,10 @@ +$ ./build/native/gcc/bin/fddev monitor --config src/app/fdctl/config/bench-zen3-32core.toml +snapshot for 2024-07-29 16:44:59.066296419 GMT+00 + tile | pid | stale | heart | sig | in backp | backp cnt | % hkeep | % backp | % wait | % ovrnp | % ovrnr | % filt1 | % filt2 | % finish +---------+---------+------------+-------+------------+----------+---------------------+----------+----------+----------+----------+----------+----------+----------+---------- + net | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 48.576 | 0.000 | 51.424 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 + quic | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.553 | 0.000 | 88.238 | 0.000 | 0.000 | 0.000 | 0.000 | 11.208 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.199 | 0.000 | 0.000 | 0.000 | 0.000 | 0.804 | 0.000 | 98.997 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.177 | 0.000 | 0.000 | 0.000 | 0.000 | 0.796 | 0.000 | 99.027 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.177 | 0.000 | 0.000 | 0.000 | 0.000 | 0.796 | 0.000 | 99.027 +[ ... snip ... 
] diff --git a/book/snippets/bench/bench6.ansi b/book/snippets/bench/bench6.ansi new file mode 100644 index 0000000000..c3d5f4f384 --- /dev/null +++ b/book/snippets/bench/bench6.ansi @@ -0,0 +1,6 @@ + + tile | sig | in backp | backp cnt | % backp | % finish +---------+------------+----------+---------------+----------+---------- + poh |  run( run) | err(err) | 1002( +22) | 76.918 | 7.892 + shred |  run( run) | err(err) | 1462( +22) | 95.857 | 3.386 + store |  run( run) |  -( -) | 0( +0) | 0.000 | 99.973 diff --git a/book/snippets/bench/bench7.ansi b/book/snippets/bench/bench7.ansi new file mode 100644 index 0000000000..9932b66c27 --- /dev/null +++ b/book/snippets/bench/bench7.ansi @@ -0,0 +1,5 @@ +$ ./build/native/gcc/bin/fddev bench --config src/app/fdctl/config/bench-zen3-32core.toml +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 272840 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 278380 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 268246 txn/s diff --git a/src/app/fdctl/config/bench-icelake-80core.toml b/src/app/fdctl/config/bench-icelake-80core.toml index 1fa479d951..9ab2c38969 100644 --- a/src/app/fdctl/config/bench-icelake-80core.toml +++ b/src/app/fdctl/config/bench-icelake-80core.toml @@ -16,9 +16,9 @@ scratch_directory = "/dev/shm/fd1" agave_affinity = "71-79/2" verify_tile_count = 33 bank_tile_count = 19 + shred_tile_count = 1 [development.genesis] - hashes_per_tick = 12500 fund_initial_accounts = 10000 [development.bench] diff --git a/src/app/fdctl/config/bench-zen3-32core.toml b/src/app/fdctl/config/bench-zen3-32core.toml index b384906e83..d4f745733f 100644 --- a/src/app/fdctl/config/bench-zen3-32core.toml +++ b/src/app/fdctl/config/bench-zen3-32core.toml @@ -13,9 +13,9 @@ agave_affinity = "58-63" verify_tile_count = 30 bank_tile_count = 6 + shred_tile_count = 1 [development.genesis] - hashes_per_tick = 12500 fund_initial_accounts = 32768 [development.bench] diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index 8286cc15f6..00b3436d8a 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -459,57 +459,84 @@ dynamic_port_range = "8900-9000" # run on which cores and for how long, Firedancer overrides most of this # behavior by pinning threads to CPU cores. # -# Consider a validator node needing to do six essential pieces of work: +# The validator splits all work into eleven distinct jobs, with each +# thread running one of the jobs: # -# 1. quic Parse received QUIC packets into transactions, and -# respond to the client appropriately -# 2. verify Verify the signature of the transaction, dropping -# invalid ones -# 3. dedup Drop duplicated or repeatedly sent transactions -# 4. pack Decide which transactions to execute, ordering them by -# profitability -# 5. bank Run the transactions in order and update accounting -# 6. shred Sign outgoing messages and forward to other validators +# - net Sends and receives network packets from the network +# device # -# This is a data pipeline. When we model the flow of a transaction -# through the system, it's a simple linear sequence, and could run -# nicely on six CPU cores, one for each stage, +# - quic Receives transactions from clients, performing all +# connection management and packet processing to manage +# and implement the QUIC protocol # -# 1 -> 2 -> 3 -> 4 -> 5 -> 6 +# - verify Verifies the cryptographic signature of incoming +# transactions, filtering invalid ones # -# Transactions can largely be processed independently, except for -# deduplication. 
With that in mind, if we had ten CPU cores, we could -# make our pipeline faster by parallelizing it as follows, +# - dedup Checks for and filters out duplicated incoming +# transactions # -# 1 -> 2 --> 3 --> 4 --> 5 -> 6 -# | | -# 1 -> 2 -+ +-> 5 -> 6 +# - pack Collects incoming transactions and smartly schedules +# them for execution when we are leader # -# The problem of deciding which cores to use, and what work to run on +# - bank Executes transactions that have been scheduled when we +# are leader +# +# - poh Continuously hashes in the background, and mixes the +# hash in with executed transactions to prove passage of +# time +# +# - shred Distributes block data to the network when leader, and +# receives and retransmits block data when not leader +# +# - store Receives block data when we are leader, or from other +# nodes when they are leader, and stores it locally in a +# database on disk +# +# - metric Collects monitoring information about other tiles and +# serves it on an HTTP endpoint +# +# - sign Holds the validator private key, and receives and +# responds to signing requests from other tiles +# +# The jobs involved in producing blocks when we are leader are organized +# in a pipeline, where transactions flow through the system in a linear +# sequence. +# +# net -> quic -> verify -> dedup -> pack -> bank -> poh -> shred -> +# store +# +# Some of these jobs (net, quic, verify, bank, and shred) can be +# parallelized, and run on multiple CPU cores at once. For example, we +# could structure the pipeline like this for performance: +# +# net -> quic +-> verify -+> dedup -> pack +-> bank -+> poh -> shred -> +# store +-> verify -+ +-> bank -+ +-> verify # -+ +-> verify -+ + +# Each instance of a job running on a CPU core is called a tile. In +# this configuration we are running 4 verify tiles and 2 bank tiles. +# +# The problem of deciding which cores to use, and what job to run on # each core we call layout. Layout is system dependent and the highest # throughput layout will vary depending on the specific hardware # available. # -# Pinning and layout is accomplished with help from a primitive we call -# a tile. A tile is a thread which you can dispatch work to. Tiles may -# either be pinned to a specific core, or float between unassigned cores -# (the OS scheduler will assign). While a tile could receive and handle -# arbitrary new work requests over its lifetime, acting like a worker -# thread in a thread pool, in practice most tiles are dispatched just -# one piece of work at startup, one of the six described above, which -# they run forever. +# Tiles communicate with each other using message queues. If a queue +# between two tiles fills up, the producer will either block, waiting +# until there is free space to continue, which is referred to as +# backpressure, or it will drop transactions or data and continue. # -# The concurrency model is that each tile runs exclusively on its own -# thread, and communicates with other tiles via. message passing. The -# message passing primitives are built on top of shared memory, but -# tiles do not otherwise communicate via. shared memory. These message -# queues between tiles are all fixed size, and when a producer outruns a -# downstream consumer and fills the outgoing buffer transactions will be -# dropped. +# A slow tile can cause backpressure through the rest of the system, +# causing it to halt, and the goal of adding more tiles is to increase +# throughput of a job, preventing dropped transactions.
For example, +# if the QUIC server was producing 100,000 transactions a second, but +# each verify tile could only handle 20,000 transactions a second, five +# of the verify tiles would be needed to keep up without dropping +# transactions. # -# A full Firedancer layout spins up these six tasks onto a variety of -# tiles and connects them together with queues so that data can flow in -# and out of the system with maximum throughput and minimal overruns. +# A full Firedancer layout spins up these eleven tasks onto a variety of +# CPU cores and connects them together with queues so that data can flow +# in and out of the system with maximum throughput and minimal drops. [layout] # Logical CPU cores to run Firedancer tiles on. Can be specified as # a single core like "0", a range like "0-10", or a range with @@ -534,57 +561,100 @@ dynamic_port_range = "8900-9000" # 3 | 2 # 4 | 4 # 5 | floating - # - # It is suggested to use all available CPU cores for Firedancer, so - # that the Solana network can run as fast as possible. affinity = "1-16" # In addition to the Firedancer tiles which use a core each, the # current version of Firedancer hosts a Agave validator as # a subprocess. # - # This affinity congtrols which logical CPU cores the Agave + # This affinity controls which logical CPU cores the Agave # subprocess and all of its threads are allowed to run on. This is # specified in the same format as the above Firedancer affinity. # # It is strongly suggested that you do not overlap the Firedancer # affinity with the Agave affinity, as Firedancer tiles expect - # to have exclusive use of their core. Unexpected latency spikes + # to have exclusive use of their core. Unexpected latency spikes # due to context switching may decrease performance overall. agave_affinity = "" # The following option is retained for backwards compatibility. It - # will be disregarded if agave_affinity is specified. + # will be ignored if agave_affinity is specified. solana_labs_affinity = "17-31" - # How many net tiles to run. Each networking tile will service - # exactly one queue from a network device being listened to. If - # there are less net tiles than queues, some queues will not get - # drained and packets will be lost, and if there are more tiles than - # queues, some tiles will spin a CPU core doing nothing since no - # packets will ever arrive. + # How many net tiles to run. Should be set to 1. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 1 net tile given + # current `mainnet-beta` conditions. + # + # Net tiles are responsible for sending and receiving packets from + # the network device configured in the [tiles.net] section below. + # Each net tile will service exactly one queue from the device, and + # Firedancer will error on boot if the number of queues on the + # device is not configured correctly. + # + # The net tile is designed to scale linearly when adding more tiles. # # See the comments for the [tiles.net] section below for more # information. net_tile_count = 1 - # How many QUIC tiles to run. Each QUIC tile can service persistent - # QUIC connections, and will be responsible for parsing and and - # responding to incoming data. + # How many QUIC tiles to run. Should be set to 1. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 1 QUIC tile given + # current `mainnet-beta` conditions, unless the validator is the + # subject of a large DoS attack.
+ # + # QUIC tiles are responsible for parsing incoming QUIC protocol + # messages, managing connections and responding to clients. + # Connections from the net tiles will be evenly distributed + # between the available QUIC tiles round robin style. + # + # QUIC tiles are designed to scale linearly when adding more tiles. quic_tile_count = 1 # How many verify tiles to run. Verify tiles perform signature # verification on incoming transactions, an expensive operation that # is often the bottleneck of the validator. - verify_tile_count = 5 - - # How many bank tiles to run. Multiple banks can run in parallel, - # if they are not writing to the same accounts at the same time. + # + # Verify tiles are designed to scale linearly when adding more + # tiles, and the verify tile count should be increased until the + # validator is not dropping incoming QUIC transactions from clients. + # + # On modern hardware, each verify tile can handle around 20-40K + # transactions per second. Four tiles seem to be enough to handle + # current `mainnet-beta` traffic, unless the validator is under a + # DoS or spam attack. + verify_tile_count = 4 + + # How many bank tiles to run. Should be set to 2. Bank tiles + # execute transactions, so the validator can include the results of + # the transaction into a block when we are leader. Because of + # current consensus limits restricting blocks to around 32,000 + # transactions per block, there is no need to use more than 2 bank + # tiles on mainnet-beta. For development and benchmarking, it can + # be useful to increase this number further. + # + # Bank tiles do not scale linearly. The current implementation uses + # the Agave runtime for execution, which takes various locks and + # uses concurrent data structures which slow down with multiple + # parallel users. bank_tile_count = 2 - # How many shred tiles to run. Multiple shred tiles can run in - # parallel to create shreds from transaction entries. - shred_tile_count = 1 + # How many shred tiles to run. Should be set to 2. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 2 shred tiles + # given current `mainnet-beta` conditions. + # + # Shred tiles distribute block data to the network when we are + # leader, and receive and retransmit it to other nodes when we are + # not leader. + # + # Shred tile performance is heavily dependent on the number of peer + # nodes in the cluster, as computing where data should go is an + # expensive function with this list of peers as the input. In + # development and benchmarking, 1 tile is sufficient to hit very + # high TPS rates because the cluster size will be very small. + shred_tile_count = 2 # All memory that will be used in Firedancer is pre-allocated in two # kinds of pages: huge and gigantic. Huge pages are 2MB and gigantic @@ -715,7 +785,7 @@ dynamic_port_range = "8900-9000" # be accepted. # # This must be >= 2 and also a power of 2. - max_concurrent_connections = 256 + max_concurrent_connections = 2048 # QUIC allows for multiple streams to be multiplexed over a # single connection. This option sets the maximum number of @@ -737,12 +807,12 @@ dynamic_port_range = "8900-9000" # QUIC uses a fixed-size pool of streams to use for all the # connections in the QUIC instance. When a new connection is - # established, QUIC attempts to allocate stream objects to it for - # incoming peer-initiated streams. Locally-initiated streams are - # allocated explicitly.
One stream per connection is reserved, so - # every connection can be used. This means that this value MUST - # be at least as high as max_concurrent_connections. - # The size of the pool is defined here: + # established, QUIC attempts to allocate stream objects to it + # for incoming peer-initiated streams. Locally initiated + # streams are allocated explicitly. One stream per connection is + # reserved, so every connection can be used. This means that + # this value must be at least as high as the value of + # `max_concurrent_connections` above. stream_pool_cnt = 4096 # Controls how much transactions coming in via TPU can be @@ -768,7 +838,7 @@ dynamic_port_range = "8900-9000" # above. # # TODO: This should be removed. - max_concurrent_handshakes = 256 + max_concurrent_handshakes = 2048 # QUIC has a concept of a "QUIC packet", there can be multiple # of these inside a UDP packet. Each QUIC packet we send to the @@ -1022,13 +1092,15 @@ dynamic_port_range = "8900-9000" # the generating computer can do in the target tick duration # specified below. # - # A value of 12500 is the same as mainnet, devnet, and testnet. + # This value specifies the initial value for the chain in the + # genesis, but it might be overridden at runtime if the related + # features which increase this value are enabled. The features + # are named like `update_hashes_per_tick2`. # - # This value specifies the initial value for chain in the - # genesis, but it might be immediately overriden at runtime if - # the related features which increase this value are enabled. - # The features are like update_hashes_per_tick2. - hashes_per_tick = 1 + # A value of 62,500 is the same as mainnet-beta, devnet, and + # testnet, following activation of the `update_hashes_per_tick6` + # feature. + hashes_per_tick = 62_500 # How long each tick of the proof of history component should # take, in microseconds. This value specifies the initial value