diff --git a/book/.vitepress/config.mts b/book/.vitepress/config.mts index 3978424b78..b771898adf 100644 --- a/book/.vitepress/config.mts +++ b/book/.vitepress/config.mts @@ -10,7 +10,7 @@ export default defineConfig({ lastUpdated: true, head: [ - ['link', { rel: 'icon', type: 'image/svg+xml', href: '/fire.svg' }], + ['link', { rel: 'icon', type: 'image/svg+xml', href: '/firedancer/fire.svg' }], ['meta', { name: 'theme-color', content: '#1ce7c2' }], ['meta', { name: 'og:type', content: 'website' }], ['meta', { name: 'og:locale', content: 'en' }], @@ -37,13 +37,19 @@ export default defineConfig({ { text: 'Initializing', link: 'initializing' }, ] }, + { + text: 'Performance', + collapsed: false, + items: [ + { text: 'Tuning', link: 'tuning' }, + ] + }, { text: 'Operating', collapsed: false, items: [ { text: 'Monitoring', link: 'monitoring' }, { text: 'Troubleshooting', link: 'troubleshooting' }, - { text: 'Tuning', link: 'tuning' }, { text: 'Frequently Asked Questions', link: 'faq' }, ] } diff --git a/book/guide/configuring.md b/book/guide/configuring.md index c3356dedd6..ec50dcdd13 100644 --- a/book/guide/configuring.md +++ b/book/guide/configuring.md @@ -4,7 +4,9 @@ Firedancer is configured via. a [TOML](https://toml.io/en/) file. Almost all options have a recommended default value that is set automatically by Firedancer, and an operator needs only to specify values for options -they wish to change. +they wish to change. The full list of options, as specified in the +[`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file, is documented below. ::: tip MIGRATING diff --git a/book/guide/monitoring.md b/book/guide/monitoring.md index 10a92f5123..a3706efd4d 100644 --- a/book/guide/monitoring.md +++ b/book/guide/monitoring.md @@ -74,18 +74,8 @@ tile_pid{kind="quic",kind_id="0"} 1108975 tile_pid{kind="verify",kind_id="0"} 1108978 ``` -All of the metrics have two two lables, a `kind` which tells you what -type of tile the metric is being reported for, and a `kind_id` which is -the index of the tile. For example, if there are two bank tiles for -executing transactions, they have `kind_id` of `0` and `1`, and each -report metrics separately. - -```sh [bash] -# HELP bank_tile_transaction_executed_program_account_not_found When a transaction executes (makes it onto the chain), result of executing a transaction. The transaction can still fail. (Attempt to load a program that does not exist.) -# TYPE bank_tile_transaction_executed_program_account_not_found counter -bank_tile_transaction_executed_program_account_not_found{kind="bank",kind_id="0"} 241 -bank_tile_transaction_executed_program_account_not_found{kind="bank",kind_id="1"} 13 -``` +See the [metrics API documentation](/api/metrics.html) for more +information on the available data. ## Live monitoring Firedancer ships with a monitoring tool included in `fdctl`, which you diff --git a/book/guide/tuning.md b/book/guide/tuning.md index 5b235907d9..12524e8778 100644 --- a/book/guide/tuning.md +++ b/book/guide/tuning.md @@ -1,81 +1,241 @@ -# Tuning +# Performance Tuning + +## Overview +The Firedancer validator is composed of a handful of threads, each +performing one of eleven distinct jobs. Some jobs only need one thread +to do them, but certain jobs require many threads performing the same +work in parallel. + +Each thread is given a CPU core to run on, and threads take ownership of +the core: never sleeping or letting the operating system use it for +another purpose.
The combination of a job, the thread it runs on, +and the CPU core it is assigned to is called a tile. The eleven kinds of +tile are, + +| Tile | Description | +|--------|-------------| +| `net` | Sends and receives network packets from the network device | +| `quic` | Receives transactions from clients, performing all connection management and packet processing to manage and implement the QUIC protocol | +| `verify` | Verifies the cryptographic signature of incoming transactions, filtering invalid ones | +| `dedup` | Checks for and filters out duplicated incoming transactions | +| `pack` | Collects incoming transactions and smartly schedules them for execution when we are leader | +| `bank` | Executes transactions that have been scheduled when we are leader | +| `poh` | Continuously hashes in the background, and mixes the hash in with executed transactions to prove passage of time | +| `shred` | Distributes block data to the network when leader, and receives and retransmits block data when not leader | +| `store` | Receives block data when we are leader, or from other nodes when they are leader, and stores it locally in a database on disk | +| `metric` | Collects monitoring information about other tiles and serves it on an HTTP endpoint | +| `sign` | Holds the validator private key, and receives and responds to signing requests from other tiles | + +These tiles communicate with each other via shared memory queues. The +work each tile performs and how they communicate with each other is +fixed, but the count of each tile kind and which CPU cores they are +assigned to is set by your configuration, and this is the primary way to +tune the performance of Firedancer. + +## Configuration +The default configuration provided if no options are specified is given +in the [`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file: + +::: code-group + +```toml [default.toml] +[layout] + affinity = "1-16" + solana_labs_affinity = "17-31" + net_tile_count = 1 + quic_tile_count = 1 + verify_tile_count = 4 + bank_tile_count = 2 + shred_tile_count = 2 ``` -## Tiles + +::: + +Note that not all tiles have a configurable count. The `dedup`, `pack`, +`poh`, `store`, `metric`, and `sign` tiles are fixed at one thread each. + +The assignment of tiles to CPU cores is determined by the `affinity` +string, which is documented fully in the +[`default.toml`](https://github.com/firedancer-io/firedancer/blob/main/src/app/fdctl/config/default.toml) +file itself. The Frankendancer validator currently starts an Agave +process to perform functionality like replay, gossip, and repair that is +not yet implemented in Firedancer. The `solana_labs_affinity` string +determines the CPU cores that are given to the threads of this Agave +process. +
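+As a sketch of how these options fit together (the exact core ranges here are illustrative and depend on your machine), every tile needs its own CPU core, so raising a tile count also means widening the Firedancer affinity and moving the Agave affinity so the two still do not overlap. For example, going from 4 to 6 verify tiles adds two tiles, and therefore needs two more cores: + +```toml +[layout] + affinity = "1-18" + solana_labs_affinity = "19-31" + verify_tile_count = 6 +``` +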
+The following table shows the performance of the adjustable tiles on an +Intel Icelake core, along with some performance notes and +recommendations for `mainnet-beta`, + +| Tile | Default | Notes | +|----------|-----------------|-------| +| `net` | 1 | Handles >1M TPS per tile. Designed to scale out for future network conditions, but there is no need to run more than 1 net tile at the moment on `mainnet-beta` | +| `quic` | 1 | Handles >1M TPS per tile. Designed to scale out for future network conditions, but there is no need to run more than 1 QUIC tile at the moment on `mainnet-beta` | +| `verify` | 4 | Handles 20-40k TPS per tile. Recommend running many verify tiles, as signature verification is the primary bottleneck of the application | +| `bank` | 2 | Handles 20-40k TPS per tile, with diminishing returns from adding more tiles. Designed to scale out for future network conditions, but 2 tiles is enough to handle current `mainnet-beta` conditions. Can be increased further when benchmarking to test future network performance | +| `shred` | 2 | Throughput is mainly dependent on cluster size, 2 tiles is enough to handle current `mainnet-beta` conditions. In benchmarking, if the cluster size is small, 1 tile can handle >1M TPS | + +## Testing +Firedancer includes a simple benchmarking tool for measuring the +transaction throughput of the validator when it is leader, in +transactions per second (TPS). In practice, the Solana network +performance is limited by two factors that are unrelated to what +this tool measures: + + - The replay performance of the slowest nodes in the network, and if +they can keep up + - The consensus limits on block size and data size + +In particular, consensus limits on the Solana protocol limit the network +strictly to around 81,000 TPS. But the tool can be useful for testing +local affinity and layout configurations. + +The benchmark runs on a single machine and performs the following: + + 1. A new genesis is created, and a set of accounts is pre-funded + 2. A set of CPU cores is assigned to generating and signing simple +transactions using these accounts as fast as possible + 3. Another set of CPU cores is assigned to sending these transfers +via. QUIC over loopback to the locally running validator + 4. Around once a second, an RPC call is made to get the total count of +transactions that have executed on the chain, and this information is +printed to the console + +The benchmark is currently quite synthetic, as it only measures single +node performance, in an idealized case where all transactions are +non-conflicting. + +## Running +The benchmark command is part of the `fddev` development binary, which +can be built with `make -j fddev`. With the binary in hand, we can run +our benchmark; here it will be on a 32 physical core AMD EPYC 7513: + +```sh [bash] +$ lscpu +Architecture: x86_64 +CPU(s): 64 +On-line CPU(s) list: 0-63 +Thread(s) per core: 2 +Core(s) per socket: 32 +Socket(s): 1 +NUMA node(s): 1 +Vendor ID: AuthenticAMD +Model name: AMD EPYC 7513 32-Core Processor +``` -To stay caught up with the cluster, the replay stage needs enough -cores and processing power. If you see your validator falling -behind with the default configuration, consider trying out the -following: +<<< @/snippets/bench/bench1.ansi -### Increase Shred Tiles +We have not provided a configuration file to the bench command, so it +is using the stock configuration from `default.toml` and reaching around +63,000 TPS. -Example Original Config: +Let's take a look at the performance with the `monitor` command and see +if we can figure out what's going on. -```toml -[layout] - affinity = "1-18" - quic_tile_count = 2 - verify_tile_count = 4 - bank_tile_count = 4 - agave_affinity = "19-31" -``` +<<< @/snippets/bench/bench2.ansi -Example New Config: +If we narrow in on just the verify tiles we can see the problem: all of +the verify tiles are completely busy processing incoming transactions, +and so additional transactions are being dropped.
Here `% finish` +indicates the percentage of time the tile is occupied doing work, while +`ovrnp cnt` indicates that the tile is being overrun by the quic tile +and dropping transactions. -```toml -[layout] - affinity = "1-18" - quic_tile_count = 2 - verify_tile_count = 5 - bank_tile_count = 2 - shred_tile_count = 2 - agave_affinity = "19-31" -``` +<<< @/snippets/bench/bench3.ansi -This takes a core from the `bank` tile (transaction execution) and -gives it to another `shred` tile (turbine and shred processing). It -takes another core from another `bank` tile and gives it to a `verify` -(signature verification) tile. +This configuration is not ideal. With some tuning to increase the number +of verify tiles, and a few other changes, we can try to achieve a higher +TPS rate, -### Increase Cores for Agave +::: code-group -Example Original Config: +```toml [bench-zen3-32core.toml] +[ledger] + # Place the ledger in memory rather than on disk so that writing the + # ledger is not a performance bottleneck + path = "/data/shm/{name}/ledger" -```toml [layout] - affinity = "1-18" + # We will need a lot of verify tiles, and a few more bank tiles to be - quic_tile_count = 2 + # able to execute at higher TPS rates. Increase their core counts, and - verify_tile_count = 5 + # assign the tiles to cores. We only need 1 shred tile; since there is - bank_tile_count = 2 + # only 1 node in the cluster, it can handle a high TPS rate by itself - shred_tile_count = 2 + affinity = "14-57,f1" - agave_affinity = "19-31" + solana_labs_affinity = "58-63" + verify_tile_count = 30 + bank_tile_count = 6 + shred_tile_count = 1 + +[development.genesis] + # The default amount of accounts to use for the benchmark is 1024, but + # to reach higher transaction throughput we need more accounts so that + # more transfers can be handled in parallel + fund_initial_accounts = 32768 + +[development.bench] + # benchg tiles are used to generate and sign transactions in the + # benchmarking tool; we are going to need more of them to test higher + # TPS rates + benchg_tile_count = 12 + + # benchs tiles are for sending the transactions to Firedancer over + # loopback, and we will need an extra one of these as well + benchs_tile_count = 2 + + # Assign these benchg, benchs (and the bencho tile which orchestrates + # the benchmarking) to some CPU cores. The bencho assignment is + # floating as it is not performance sensitive + affinity = "f1,0-13" + + # The Solana protocol consensus limits restrict the benchmark to + # around 81,000 TPS. We have special options to increase these limits + # for testing and benchmarking + larger_max_cost_per_block = true + larger_shred_limits_per_block = true + +[rpc] + # Tracking certain transaction history and metadata to serve RPC + # requests is expensive and can slow down our validator, so turn this + # functionality off + transaction_history = false + extended_tx_metadata_storage = false ``` -Example New Config: +::: -```toml -[layout] - affinity = "1-16" - quic_tile_count = 1 - verify_tile_count = 4 - bank_tile_count = 2 - shred_tile_count = 2 - agave_affinity = "17-31" -``` +Now try running again, -This takes 1 core from the `quic` tile and another from the `verify` -tile gives them both to the agave threads (where the replay stage -runs). +<<< @/snippets/bench/bench4.ansi -## QUIC +We start out with a higher TPS rate but it quickly falls back to around +90k TPS. We can try to figure out why by running the `monitor` command. -There is a lot of QUIC traffic in the cluster.
If the validator is -having a hard time establishing QUIC connections, it might end up -getting less transactions. Some parameters that can be tuned to address -this are (these 2 parameters need to be the same value): +<<< @/snippets/bench/bench5.ansi -```toml -[tiles.quic] - max_concurrent_connections = 2048 - max_concurrent_handshakes = 2048 +The culprit is visible in the output, which will be clearer if we filter +it down to the relevant information, + +<<< @/snippets/bench/bench6.ansi + +Here we see what is happening. The blockstore is completely busy +spending 99.973% of its time storing data, while the PoH and shred tiles +are in back-pressure waiting for the the blockstore to catch up. The +blockstore is an Agave component built on RocksDB that is not rewritten +as part of Frankendancer. + +::: code-group + +```toml [bench-zen3-32core.toml] +[development.bench] + disable_blockstore = true // [!code ++] ``` + +::: + +We can disable the blockstore specifically for benchmarking, to show the +performance of just the Firedancer components in the leader pipeline. +Now we can run one more time and see a reasonably good value for the TPS +throughput of Firedancer on this machine. + +<<< @/snippets/bench/bench7.ansi diff --git a/book/snippets/bench/bench1.ansi b/book/snippets/bench/bench1.ansi new file mode 100644 index 0000000000..fa726f49d8 --- /dev/null +++ b/book/snippets/bench/bench1.ansi @@ -0,0 +1,8 @@ +$ ./build/native/gcc/bin/fddev bench +NOTICE  main configure.c(106): kill ... configuring +NOTICE  main configure.c(81): netns ... skipping .. not enabled +NOTICE  main configure.c(102): hugetlbfs ... already valid +NOTICE  main configure.c(102): sysctl ... already valid +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 63373 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 63340 txn/s diff --git a/book/snippets/bench/bench2.ansi b/book/snippets/bench/bench2.ansi new file mode 100644 index 0000000000..361aa74b8c --- /dev/null +++ b/book/snippets/bench/bench2.ansi @@ -0,0 +1,10 @@ +$ ./build/native/gcc/bin/fddev monitor +snapshot for 2024-07-29 16:44:59.066296419 GMT+00 + tile | pid | stale | heart | sig | in backp | backp cnt | % hkeep | % backp | % wait | % ovrnp | % ovrnr | % filt1 | % filt2 | % finish +---------+---------+------------+-------+------------+----------+---------------------+----------+----------+----------+----------+----------+----------+----------+---------- + net | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 35.511 | 0.000 | 54.233 | 0.000 | 0.000 | 0.000 | 10.255 | 0.000 + quic | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.405 | 0.000 | 95.501 | 0.000 | 0.000 | 0.000 | 0.000 | 4.094 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.081 | 0.000 | 0.047 | 0.000 | 0.000 | 0.084 | 0.000 | 99.788 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.082 | 0.000 | 0.036 | 0.000 | 0.000 | 0.078 | 0.000 | 99.804 + verify | 2555236 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.123 | 0.000 | 0.000 | 0.000 | 0.000 | 0.111 | 0.000 | 99.765 +[ ... snip ... 
] diff --git a/book/snippets/bench/bench3.ansi b/book/snippets/bench/bench3.ansi new file mode 100644 index 0000000000..66e2ce90a2 --- /dev/null +++ b/book/snippets/bench/bench3.ansi @@ -0,0 +1,13 @@ + tile | sig | in backp | backp cnt | % backp | % finish +---------+------------+----------+---------------+----------+---------- + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.788 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.804 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.765 + verify |  run( run) |  -( -) | 0( +0) | 0.000 | 99.745 + + link | tot TPS | ovrnp cnt | ovrnr cnt | slow cnt | tx seq +------------------+----------+---------------------+---------------------+---------------------+------------------- + quic->verify | 17.2K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 17.2K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 11.8K | 15( +0) | 0( +0) | 0( +0) | 507134( +7149) + quic->verify | 17.0K | 9( +1) | 0( +0) | 0( +0) | 507134( +7149) diff --git a/book/snippets/bench/bench4.ansi b/book/snippets/bench/bench4.ansi new file mode 100644 index 0000000000..38873049a0 --- /dev/null +++ b/book/snippets/bench/bench4.ansi @@ -0,0 +1,7 @@ +$ ./build/native/gcc/bin/fddev bench --config src/app/fdctl/config/bench-zen3-32core.toml +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 191180 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 308027 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 12389 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 89564 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 92380 txn/s diff --git a/book/snippets/bench/bench5.ansi b/book/snippets/bench/bench5.ansi new file mode 100644 index 0000000000..c30d0327bd --- /dev/null +++ b/book/snippets/bench/bench5.ansi @@ -0,0 +1,10 @@ +$ ./build/native/gcc/bin/fddev monitor --config src/app/fdctl/config/bench-zen3-32core.toml +snapshot for 2024-07-29 16:44:59.066296419 GMT+00 + tile | pid | stale | heart | sig | in backp | backp cnt | % hkeep | % backp | % wait | % ovrnp | % ovrnr | % filt1 | % filt2 | % finish +---------+---------+------------+-------+------------+----------+---------------------+----------+----------+----------+----------+----------+----------+----------+---------- + net | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 48.576 | 0.000 | 51.424 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 + quic | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.553 | 0.000 | 88.238 | 0.000 | 0.000 | 0.000 | 0.000 | 11.208 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.199 | 0.000 | 0.000 | 0.000 | 0.000 | 0.804 | 0.000 | 98.997 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.177 | 0.000 | 0.000 | 0.000 | 0.000 | 0.796 | 0.000 | 99.027 + verify | 2239972 |  - |  - |  run( run) |  -( -) | 0( +0) | 0.177 | 0.000 | 0.000 | 0.000 | 0.000 | 0.796 | 0.000 | 99.027 +[ ... snip ... 
] diff --git a/book/snippets/bench/bench6.ansi b/book/snippets/bench/bench6.ansi new file mode 100644 index 0000000000..c3d5f4f384 --- /dev/null +++ b/book/snippets/bench/bench6.ansi @@ -0,0 +1,6 @@ + + tile | sig | in backp | backp cnt | % backp | % finish +---------+------------+----------+---------------+----------+---------- + poh |  run( run) | err(err) | 1002( +22) | 76.918 | 7.892 + shred |  run( run) | err(err) | 1462( +22) | 95.857 | 3.386 + store |  run( run) |  -( -) | 0( +0) | 0.000 | 99.973 diff --git a/book/snippets/bench/bench7.ansi b/book/snippets/bench/bench7.ansi new file mode 100644 index 0000000000..9932b66c27 --- /dev/null +++ b/book/snippets/bench/bench7.ansi @@ -0,0 +1,5 @@ +$ ./build/native/gcc/bin/fddev bench --config src/app/fdctl/config/bench-zen3-32core.toml +[ ... snip ... ] +NOTICE  bencho:0 fd_bencho.c(137): 272840 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 278380 txn/s +NOTICE  bencho:0 fd_bencho.c(137): 268246 txn/s diff --git a/src/app/fdctl/config/bench-icelake-80core.toml b/src/app/fdctl/config/bench-icelake-80core.toml index 1fa479d951..9ab2c38969 100644 --- a/src/app/fdctl/config/bench-icelake-80core.toml +++ b/src/app/fdctl/config/bench-icelake-80core.toml @@ -16,9 +16,9 @@ scratch_directory = "/dev/shm/fd1" agave_affinity = "71-79/2" verify_tile_count = 33 bank_tile_count = 19 + shred_tile_count = 1 [development.genesis] - hashes_per_tick = 12500 fund_initial_accounts = 10000 [development.bench] diff --git a/src/app/fdctl/config/bench-zen3-32core.toml b/src/app/fdctl/config/bench-zen3-32core.toml index b384906e83..d4f745733f 100644 --- a/src/app/fdctl/config/bench-zen3-32core.toml +++ b/src/app/fdctl/config/bench-zen3-32core.toml @@ -13,9 +13,9 @@ agave_affinity = "58-63" verify_tile_count = 30 bank_tile_count = 6 + shred_tile_count = 1 [development.genesis] - hashes_per_tick = 12500 fund_initial_accounts = 32768 [development.bench] diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index 8286cc15f6..00b3436d8a 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -459,57 +459,84 @@ dynamic_port_range = "8900-9000" # run on which cores and for how long, Firedancer overrides most of this # behavior by pinning threads to CPU cores. # -# Consider a validator node needing to do six essential pieces of work: +# The validator splits all work into eleven distinct jobs, with each +# thread running one of the jobs: # -# 1. quic Parse received QUIC packets into transactions, and -# respond to the client appropriately -# 2. verify Verify the signature of the transaction, dropping -# invalid ones -# 3. dedup Drop duplicated or repeatedly sent transactions -# 4. pack Decide which transactions to execute, ordering them by -# profitability -# 5. bank Run the transactions in order and update accounting -# 6. shred Sign outgoing messages and forward to other validators +# - net Sends and receives network packets from the network +# device # -# This is a data pipeline. When we model the flow of a transaction -# through the system, it's a simple linear sequence, and could run -# nicely on six CPU cores, one for each stage, +# - quic Receives transactions from clients, performing all +# connection management and packet processing to manage +# and implement the QUIC protocol # -# 1 -> 2 -> 3 -> 4 -> 5 -> 6 +# - verify Verifies the cryptographic signature of incoming +# transactions, filtering invalid ones # -# Transactions can largely be processed independently, except for -# deduplication. 
With that in mind, if we had ten CPU cores, we could -# make our pipeline faster by parallelizing it as follows, +# - dedup Checks for and filters out duplicated incoming +# transactions # -# 1 -> 2 --> 3 --> 4 --> 5 -> 6 -# | | -# 1 -> 2 -+ +-> 5 -> 6 +# - pack Collects incoming transactions and smartly schedules +# them for execution when we are leader # -# The problem of deciding which cores to use, and what work to run on +# - bank Executes transactions that have been scheduled when we +# are leader +# +# - poh Continuously hashes in the background, and mixes the +# hash in with executed transactions to prove passage of +# time +# +# - shred Distributes block data to the network when leader, and +# receives and retransmits block data when not leader +# +# - store Receives block data when we are leader, or from other +# nodes when they are leader, and stores it locally in a +# database on disk +# +# - metric Collects monitoring information about other tiles and +# serves it on an HTTP endpoint +# +# - sign Holds the validator private key, and receives and +# responds to signing requests from other tiles +# +# The jobs involved in producing blocks when we are leader are organized +# in a pipeline, where transactions flow through the system in a linear +# sequence. +# +# net -> quic -> verify -> dedup -> pack -> bank -> poh -> shred -> +# store +# +# Some of these jobs (net, quic, verify, bank, and shred) can be +# parallelized, and run on multiple CPU cores at once. For example, we +# could structure the pipeline like this for performance: +# +# net -> quic +-> verify -+> dedup -> pack +-> bank -+> poh -> shred -> +# store +-> verify -+ +-> bank -+ +-> verify # -+ +-> verify -+ + +# Each instance of a job running on a CPU core is called a tile. In +# this configuration we are running 4 verify tiles and 2 bank tiles. +# +# The problem of deciding which cores to use, and what job to run on # each core we call layout. Layout is system dependent and the highest # throughput layout will vary depending on the specific hardware # available. # -# Pinning and layout is accomplished with help from a primitive we call -# a tile. A tile is a thread which you can dispatch work to. Tiles may -# either be pinned to a specific core, or float between unassigned cores -# (the OS scheduler will assign). While a tile could receive and handle -# arbitrary new work requests over its lifetime, acting like a worker -# thread in a thread pool, in practice most tiles are dispatched just -# one piece of work at startup, one of the six described above, which -# they run forever. +# Tiles communicate with each other using message queues. If a queue +# between two tiles fills up, the producer will either block, waiting +# until there is free space to continue, which is referred to as +# backpressure, or it will drop transactions or data and continue. # -# The concurrency model is that each tile runs exclusively on its own -# thread, and communicates with other tiles via. message passing. The -# message passing primitives are built on top of shared memory, but -# tiles do not otherwise communicate via. shared memory. These message -# queues between tiles are all fixed size, and when a producer outruns a -# downstream consumer and fills the outgoing buffer transactions will be -# dropped. +# A slow tile can cause backpressure through the rest of the system, +# causing it to halt, and the goal of adding more tiles is to increase +# throughput of a job, preventing dropped transactions.
For example, +# if the QUIC server was producing 100,000 transactions a second, but +# each verify tile could only handle 20,000 transactions a second, five +# of the verify tiles would be needed to keep up without dropping +# transactions. # -# A full Firedancer layout spins up these six tasks onto a variety of -# tiles and connects them together with queues so that data can flow in -# and out of the system with maximum throughput and minimal overruns. +# A full Firedancer layout spins up these eleven tasks onto a variety of +# CPU cores and connects them together with queues so that data can flow +# in and out of the system with maximum throughput and minimal drops. [layout] # Logical CPU cores to run Firedancer tiles on. Can be specified as # a single core like "0", a range like "0-10", or a range with @@ -534,57 +561,100 @@ dynamic_port_range = "8900-9000" # 3 | 2 # 4 | 4 # 5 | floating - # - # It is suggested to use all available CPU cores for Firedancer, so - # that the Solana network can run as fast as possible. affinity = "1-16" # In addition to the Firedancer tiles which use a core each, the # current version of Firedancer hosts a Agave validator as # a subprocess. # - # This affinity congtrols which logical CPU cores the Agave + # This affinity controls which logical CPU cores the Agave # subprocess and all of its threads are allowed to run on. This is # specified in the same format as the above Firedancer affinity. # # It is strongly suggested that you do not overlap the Firedancer # affinity with the Agave affinity, as Firedancer tiles expect - # to have exclusive use of their core. Unexpected latency spikes + # to have exclusive use of their core. Unexpected latency spikes # due to context switching may decrease performance overall. agave_affinity = "" # The following option is retained for backwards compatibility. It - # will be disregarded if agave_affinity is specified. + # will be ignored if agave_affinity is specified. solana_labs_affinity = "17-31" - # How many net tiles to run. Each networking tile will service - # exactly one queue from a network device being listened to. If - # there are less net tiles than queues, some queues will not get - # drained and packets will be lost, and if there are more tiles than - # queues, some tiles will spin a CPU core doing nothing since no - # packets will ever arrive. + # How many net tiles to run. Should be set to 1. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 1 net tile given + # current `mainnet-beta` conditions. + # + # Net tiles are responsible for sending and receiving packets from + # the network device configured in the [tiles.net] section below. + # Each net tile will service exactly one queue from the device, and + # Firedancer will error on boot if the number of queues on the + # device is not configured correctly. + # + # The net tile is designed to scale linearly when adding more tiles. # # See the comments for the [tiles.net] section below for more # information. net_tile_count = 1 - # How many QUIC tiles to run. Each QUIC tile can service persistent - # QUIC connections, and will be responsible for parsing and and - # responding to incoming data. + # How many QUIC tiles to run. Should be set to 1. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 1 QUIC tile given + # current `mainnet-beta` conditions, unless the validator is the + # subject of a large DoS attack.
+ # + # QUIC tiles are responsible for parsing incoming QUIC protocol + # messages, managing connections and responding to clients. + # Connections from the net tiles will be evenly distributed + # between the available QUIC tiles round robin style. + # + # QUIC tiles are designed to scale linearly when adding more tiles. quic_tile_count = 1 # How many verify tiles to run. Verify tiles perform signature # verification on incoming transactions, an expensive operation that # is often the bottleneck of the validator. - verify_tile_count = 5 - - # How many bank tiles to run. Multiple banks can run in parallel, - # if they are not writing to the same accounts at the same time. + # + # Verify tiles are designed to scale linearly when adding more + # tiles, and the verify tile count should be increased until the + # validator is not dropping incoming QUIC transactions from clients. + # + # On modern hardware, each verify tile can handle around 20-40K + # transactions per second. Four tiles seem to be enough to handle + # current `mainnet-beta` traffic, unless the validator is under a + # DoS or spam attack. + verify_tile_count = 4 + + # How many bank tiles to run. Should be set to 2. Bank tiles + # execute transactions, so the validator can include the results of + # the transaction into a block when we are leader. Because of + # current consensus limits restricting blocks to around 32,000 + # transactions per block, there is no need to use more than 2 bank + # tiles on mainnet-beta. For development and benchmarking, it can + # be useful to increase this number further. + # + # Bank tiles do not scale linearly. The current implementation uses + # the Agave runtime for execution, which takes various locks and + # uses concurrent data structures which slow down with multiple + # parallel users. bank_tile_count = 2 - # How many shred tiles to run. Multiple shred tiles can run in - # parallel to create shreds from transaction entries. - shred_tile_count = 1 + # How many shred tiles to run. Should be set to 2. This is + # configurable and designed to scale out for future network + # conditions, but there is no need to run more than 2 shred tiles + # given current `mainnet-beta` conditions. + # + # Shred tiles distribute block data to the network when we are + # leader, and receive and retransmit it to other nodes when we are + # not leader. + # + # Shred tile performance is heavily dependent on the number of peer + # nodes in the cluster, as computing where data should go is an + # expensive function with this list of peers as the input. In + # development and benchmarking, 1 tile is sufficient to hit very + # high TPS rates because the cluster size will be very small. + shred_tile_count = 2 # All memory that will be used in Firedancer is pre-allocated in two # kinds of pages: huge and gigantic. Huge pages are 2MB and gigantic @@ -715,7 +785,7 @@ dynamic_port_range = "8900-9000" # be accepted. # # This must be >= 2 and also a power of 2. - max_concurrent_connections = 256 + max_concurrent_connections = 2048 # QUIC allows for multiple streams to be multiplexed over a # single connection. This option sets the maximum number of @@ -737,12 +807,12 @@ dynamic_port_range = "8900-9000" # QUIC uses a fixed-size pool of streams to use for all the # connections in the QUIC instance. When a new connection is - # established, QUIC attempts to allocate stream objects to it for - # incoming peer-initiated streams. Locally-initiated streams are - # allocated explicitly.
One stream per connection is reserved, so - # every connection can be used. This means that this value MUST - # be at least as high as max_concurrent_connections. - # The size of the pool is defined here: + # established, QUIC attempts to allocate stream objects to it + # for incoming peer-initiated streams. Locally initiated + # streams are allocated explicitly. One stream per connection is + # reserved, so every connection can be used. This means that + # this value must be at least as high as the value of + # `max_concurrent_connections` above. stream_pool_cnt = 4096 # Controls how much transactions coming in via TPU can be @@ -768,7 +838,7 @@ dynamic_port_range = "8900-9000" # above. # # TODO: This should be removed. - max_concurrent_handshakes = 256 + max_concurrent_handshakes = 2048 # QUIC has a concept of a "QUIC packet", there can be multiple # of these inside a UDP packet. Each QUIC packet we send to the @@ -1022,13 +1092,15 @@ dynamic_port_range = "8900-9000" # the generating computer can do in the target tick duration # specified below. # - # A value of 12500 is the same as mainnet, devnet, and testnet. + # This value specifies the initial value for the chain in the + # genesis, but it might be overridden at runtime if the related + # features which increase this value are enabled. The features + # are named like `update_hashes_per_tick2`. # - # This value specifies the initial value for chain in the - # genesis, but it might be immediately overriden at runtime if - # the related features which increase this value are enabled. - # The features are like update_hashes_per_tick2. - hashes_per_tick = 1 + # A value of 62,500 is the same as mainnet-beta, devnet, and + # testnet, following activation of the `update_hashes_per_tick6` + # feature. + hashes_per_tick = 62_500 # How long each tick of the proof of history component should # take, in microseconds. This value specifies the initial value