From af519140c2f783e94ea564ad3ddd269a9e70ddfc Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 22 Jul 2025 14:55:06 -0700
Subject: [PATCH 01/11] add link to Frank's trtllm-bench blog post to perf_overview.md

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 2b0df0f9066..ababd8be812 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -12,6 +12,8 @@ Tuning batch sizes, parallelism configurations, and other options may lead to im
 
 For DeepSeek R1 performance, please check out our [performance guide](../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md)
 
+For more information on benchmarking with `trtllm-bench`, see this NVIDIA [blog post](https://developer.nvidia.com/blog/llm-inference-benchmarking-performance-tuning-with-tensorrt-llm/).
+
 ## Throughput Measurements
 
 The below table shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
@@ -216,7 +218,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data
 trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
 ```
 
-The data collected for the v0.20 benchmarks was run with the following file:
+The data collected for the v0.21 benchmarks was run with the following file:
 `llm_options.yml`
 
 ```yaml

From ab941caf70a5fac59c4fc7a14e0f16965669acbf Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Wed, 23 Jul 2025 21:52:58 -0700
Subject: [PATCH 02/11] add initial 0.21 perf tables

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 172 +++++++++++------------
 1 file changed, 81 insertions(+), 91 deletions(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index ababd8be812..b7d601fb106 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -31,39 +31,41 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 
 #### Llama 3.3 70B FP4
 
-| | GPU | B200 | | | |
-|:------------------------|:--------|:----------|:----------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 |
-| ISL, OSL | | | | | |
-| | | | | | |
-| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 |
-| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 |
-| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 |
-| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 |
-| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 |
-| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 |
-| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 |
-| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 |
-| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 |
-| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 |
+| | GPU: | B200 | GB200 180GB |
+|:-----------------------------|:---|:----------|:--------------|
+| | TP Size | 1 | 1 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 128 | | 10,613.84 | 11,100.97 |
+| 128, 2048 | | 9,445.51 | 10,276.05 |
+| 128, 4096 | | 6,276.85 | 7,351.12 |
+| 500, 2000 | | 6,983.27 | 8,194.30 |
+| 1000, 1000 | | 6,434.29 | 7,401.80 |
+| 1000, 2000 | | 6,725.03 | 6,478.72 |
+| 1024, 2048 | | 6,546.61 | 7,922.88 |
+| 2048, 128 | | 1,330.35 | 1,418.47 |
+| 2048, 2048 | | 4,528.48 | 5,326.77 |
+| 5000, 500 | | 1,427.44 | 1,502.44 |
+| 20000, 2000 | | 636.36 | 732.43 |
 
 #### Llama 3.1 405B FP4
 
-| | GPU | B200 | |
-|:------------------------|:------- |:---------|:----------|
-| | TP Size | 4 | 8 |
-| ISL, OSL | | | |
-| | | | |
-| 128, 128 | | 6,163.81 | 9,002.90 |
-| 128, 2048 | | 7,081.21 | 10,288.28 |
-| 128, 4096 | | 6,028.37 | 8,713.77 |
-| 500, 2000 | | 5,858.75 | 9,125.86 |
-| 1000, 1000 | | 4,848.00 | 7,582.97 |
-| 1000, 2000 | | 5,375.25 | 7,626.28 |
-| 1024, 2048 | | 5,345.70 | 7,464.03 |
-| 2048, 128 | | 693.55 | 1,086.56 |
-| 5000, 500 | | 947.49 | 1,532.45 |
-| 20000, 2000 | | 641.11 | 1,097.84 |
+| | GPU: | B200 | GB200 180GB |
+|:-----------------------------|:---|:---------|:--------------|
+| | TP Size | 4 | 4 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 128 | | 6,218.89 | 6,598.97 |
+| 128, 2048 | | 7,178.10 | 7,497.40 |
+| 128, 4096 | | 5,890.89 | 5,898.19 |
+| 500, 2000 | | 5,844.37 | 6,198.33 |
+| 1000, 1000 | | 4,958.53 | 5,243.35 |
+| 1000, 2000 | | 4,874.16 | 4,905.51 |
+| 1024, 2048 | | 4,833.19 | 4,686.38 |
+| 2048, 128 | | 737.95 | 761.58 |
+| 2048, 2048 | | 4,024.02 | 4,326.56 |
+| 5000, 500 | | 1,032.40 | 1,078.87 |
+| 20000, 2000 | | 667.39 | 649.95 |
 
 ### FP8 Models:
 ```
@@ -75,74 +77,62 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 3.1 8B FP8
 
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
-|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 1 | 1 |
-| ISL, OSL | | | |
-| | | | |
-| 128, 128 | | 27,970.14 | 27,688.36 |
-| 128, 2048 | | 23,326.38 | 21,841.15 |
-| 128, 4096 | | 17,508.51 | 13,730.89 |
-| 500, 2000 | | 21,390.41 | 17,833.34 |
-| 1000, 1000 | | 17,366.89 | 15,270.62 |
-| 1000, 2000 | | 16,831.31 | 13,798.08 |
-| 1024, 2048 | | 16,737.03 | 13,385.50 |
-| 2048, 128 | | 3,488.03 | 3,414.67 |
-| 5000, 500 | | 3,813.69 | 3,394.54 |
-| 20000, 2000 | | 1,696.66 | 1,345.42 |
+| | GPU: | GH200 480GB | H100 80GB HBM3 | H200 141GB HBM3 |
+|:-----------------------------|:---|:--------------|:-----------------|:------------------|
+| | TP Size | 1 | 1 | 1 |
+| ISL, OSL | | | | |
+| | | | | |
+| 128, 128 | | 27,304.25 | 26,401.48 | 27,027.80 |
+| 128, 2048 | | 24,045.60 | 21,413.21 | 23,102.25 |
+| 128, 4096 | | 15,409.85 | 13,541.54 | 17,396.83 |
+| 500, 2000 | | 20,123.88 | 17,571.01 | 19,759.16 |
+| 1000, 1000 | | 16,352.99 | 14,991.62 | 17,162.49 |
+| 1000, 2000 | | 15,705.82 | 13,505.23 | 16,227.11 |
+| 1024, 2048 | | 16,102.52 | 13,165.91 | 16,057.66 |
+| 2048, 128 | | 3,573.85 | 3,275.55 | 3,390.69 |
+| 2048, 2048 | | 10,767.05 | 9,462.43 | 11,822.14 |
+| 5000, 500 | | 3,584.74 | 3,276.47 | 3,758.08 |
+| 20000, 2000 | | 1,393.31 | 1,340.69 | 1,705.68 |
 
 #### Llama 3.3 70B FP8
 
-| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
-|:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
-| ISL, OSL | | | | | | | | | |
-| | | | | | | | | | |
-| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | |
-| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 |
-| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 |
-| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 |
-| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 |
-| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 |
-| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 |
-| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 |
-| 2048, 2048 | | | | | 10,412.85 | | | | |
-| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 |
-| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 |
+| | GPU: | H100 80GB HBM3 | H200 141GB HBM3 |
+|:-----------------------------|:---|:-----------------|:------------------|
+| | TP Size | 2 | 2 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 128 | | 6,092.28 | 6,327.98 |
+| 128, 2048 | | 5,892.94 | 7,467.36 |
+| 128, 4096 | | 3,828.46 | 5,526.42 |
+| 500, 2000 | | 4,654.74 | 6,639.15 |
+| 1000, 1000 | | 4,181.06 | 4,773.33 |
+| 1000, 2000 | | 3,708.93 | 5,790.36 |
+| 1024, 2048 | | 3,785.04 | 5,480.44 |
+| 2048, 128 | | 723.40 | 747.55 |
+| 2048, 2048 | | 2,785.53 | 3,775.80 |
+| 5000, 500 | | 865.55 | 978.28 |
+| 20000, 2000 | | 411.85 | 609.42 |
 
 #### Llama 3.1 405B FP8
-
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
-|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 8 | 8 |
-| ISL, OSL | | | |
-| | | | |
-| 128, 2048 | | 5,567.87 | |
-| 128, 4096 | | 5,136.85 | |
-| 500, 2000 | | 4,787.61 | 3,673.91 |
-| 1000, 1000 | | 3,286.30 | 3,012.22 |
-| 1000, 2000 | | 3,636.76 | 3,262.20 |
-| 1024, 2048 | | 3,618.66 | 3,109.70 |
-| 2048, 128 | | 443.10 | 449.02 |
-| 5000, 500 | | 645.46 | |
-| 20000, 2000 | | | 372.12 |
+| | GPU: | H100 80GB HBM3 | H200 141GB HBM3 |
+|:-----------------------------|:---|:-----------------|:------------------|
+| | TP Size | 8 | 8 |
+| Runtime Input/Output Lengths | | | |
+| | | | |
+| 128, 128 | | | 3,705.18 |
+| 128, 2048 | | 4,517.39 | 4,715.13 |
+| 128, 4096 | | 2,910.31 | 4,475.91 |
+| 500, 2000 | | 3,664.62 | 4,804.10 |
+| 1000, 1000 | | 2,955.50 | 3,208.25 |
+| 1000, 2000 | | 2,884.69 | 3,630.29 |
+| 1024, 2048 | | 3,237.41 | 3,609.50 |
+| 2048, 128 | | 433.47 | 441.35 |
+| 2048, 2048 | | 2,216.55 | 2,840.86 |
+| 5000, 500 | | 579.05 | 645.26 |
+| 20000, 2000 | | 363.27 | 509.87 |
 
 #### Llama 4 Maverick FP8
-
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
-|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 8 | 8 |
-| ISL, OSL | | | |
-| | | | |
-| 128, 2048 | | 27,543.87 | |
-| 128, 4096 | | 18,541.01 | 11,163.12 |
-| 500, 2000 | | 21,117.34 | |
-| 1000, 2000 | | | 10,556.00 |
-| 1024, 2048 | | 16,859.45 | 11,584.33 |
-| 2048, 128 | | 4,364.06 | 3,832.38 |
-| 2048, 2048 | | 12,800.89 | |
-| 5000, 500 | | 5,128.60 | |
-| 20000, 2000 | | 1,764.27 | 1,400.79 |
+TODO
 
 ## Reproducing Benchmarked Results
 

From b8d1a6d4f12c5aaed08bcbb1b5b005eb88517931 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Wed, 23 Jul 2025 21:56:25 -0700
Subject: [PATCH 03/11] add 405b maverick perf data and note

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index b7d601fb106..8201cf72c6d 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -132,7 +132,23 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 | 20000, 2000 | | 363.27 | 509.87 |
 
 #### Llama 4 Maverick FP8
-TODO
+
+*Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+|:-----------------------------|:---|:------------------|:-----------------|
+| | TP Size | 8 | 8 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 2048 | | 27,543.87 | |
+| 128, 4096 | | 18,541.01 | 11,163.12 |
+| 500, 2000 | | 21,117.34 | |
+| 1000, 2000 | | | 10,556.00 |
+| 1024, 2048 | | 16,859.45 | 11,584.33 |
+| 2048, 128 | | 4,364.06 | 3,832.38 |
+| 2048, 2048 | | 12,800.89 | |
+| 5000, 500 | | 5,128.60 | |
+| 20000, 2000 | | 1,764.27 | 1,400.79 |
 
 ## Reproducing Benchmarked Results
 

From c67679170454fc4948d9072f991ee882775e88d9 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Wed, 23 Jul 2025 23:39:33 -0700
Subject: [PATCH 04/11] clarify working on maverick perf note

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 8201cf72c6d..60e65d32bb2 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -77,7 +77,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 3.1 8B FP8
 
-| | GPU: | GH200 480GB | H100 80GB HBM3 | H200 141GB HBM3 |
+| | GPU: | GH200 | H100 80GB HBM3 | H200 141GB HBM3 |
 |:-----------------------------|:---|:--------------|:-----------------|:------------------|
 | | TP Size | 1 | 1 | 1 |
 | ISL, OSL | | | | |
@@ -133,7 +133,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 4 Maverick FP8
 
-*Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20
+Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20
 
 | | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
 |:-----------------------------|:---|:------------------|:-----------------|

From 961348559facde967c23364653d2a2ca3a19cea5 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Wed, 23 Jul 2025 23:43:02 -0700
Subject: [PATCH 05/11] change GH200 memory to reflect only the GPU vram

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 60e65d32bb2..9f526e13a65 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -77,7 +77,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 3.1 8B FP8
 
-| | GPU: | GH200 | H100 80GB HBM3 | H200 141GB HBM3 |
+| | GPU: | GH200 96GB | H100 80GB HBM3 | H200 141GB HBM3 |
 |:-----------------------------|:---|:--------------|:-----------------|:------------------|
 | | TP Size | 1 | 1 | 1 |
 | ISL, OSL | | | | |

From a6889260ccdb58184de2e5d6eff4b31414d5d2ab Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 18:55:38 -0700
Subject: [PATCH 06/11] add hardware heading with information about benchmarked gpu variants

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 9f526e13a65..048b9555bb6 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -23,6 +23,13 @@ The performance numbers below were collected using the steps described in this d
 
 Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
 
+### Hardware:
+The following GPU variants were used for testing:
+- H100 SXM 80GB (DGX H100)
+- H200 SXM 141GB (DGX H200)
+- B200 180GB (DGX B200)
+- GB200 192GB (GB200 NVL72)
+
 ### FP4 Models:
 ```
 nvidia/Llama-3.3-70B-Instruct-FP4
@@ -31,7 +38,7 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 
 #### Llama 3.3 70B FP4
 
-| | GPU: | B200 | GB200 180GB |
+| | GPU: | B200 | GB200 |
 |:-----------------------------|:---|:----------|:--------------|
 | | TP Size | 1 | 1 |
 | ISL, OSL | | | |
@@ -57,7 +64,7 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 
 #### Llama 3.1 405B FP4
 
-| | GPU: | B200 | GB200 180GB |
+| | GPU: | B200 | GB200 |
 |:-----------------------------|:---|:---------|:--------------|
 | | TP Size | 4 | 4 |
 | ISL, OSL | | | |
@@ -77,7 +84,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 3.1 8B FP8
 
-| | GPU: | GH200 96GB | H100 80GB HBM3 | H200 141GB HBM3 |
+| | GPU: | GH200 | H100 | H200 |
 |:-----------------------------|:---|:--------------|:-----------------|:------------------|
 | | TP Size | 1 | 1 | 1 |
 | ISL, OSL | | | | |
@@ -96,7 +103,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 #### Llama 3.3 70B FP8
 
-| | GPU: | H100 80GB HBM3 | H200 141GB HBM3 |
+| | GPU: | H100 | H200 |
 |:-----------------------------|:---|:-----------------|:------------------|
 | | TP Size | 2 | 2 |
 | ISL, OSL | | | |
@@ -114,7 +121,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 | 20000, 2000 | | 411.85 | 609.42 |
 
 #### Llama 3.1 405B FP8
-| | GPU: | H100 80GB HBM3 | H200 141GB HBM3 |
+| | GPU: | H100 | H200 |
 |:-----------------------------|:---|:-----------------|:------------------|
 | | TP Size | 8 | 8 |
 | Runtime Input/Output Lengths | | | |
@@ -135,7 +142,7 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
 
 Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20
 
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+| | GPU | H200 | H100 |
 |:-----------------------------|:---|:------------------|:-----------------|
 | | TP Size | 8 | 8 |
 | ISL, OSL | | | |
@@ -248,7 +255,7 @@ cuda_graph_batch_sizes:
 - 8192
 ```
 
-In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
+In many cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` or lower if out-of-memory errors are encountered.
 
 The results will be printed to the terminal upon benchmark completion. For example,

From b1d010b80518040ac9e1c45d2d199b35a6ef8a08 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 18:55:46 -0700
Subject: [PATCH 07/11] add hardware heading with information about benchmarked gpu variants

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 048b9555bb6..8999a06e6eb 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -30,6 +30,8 @@ The following GPU variants were used for testing:
 - B200 180GB (DGX B200)
 - GB200 192GB (GB200 NVL72)
 
+Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads.
+
 ### FP4 Models:
 ```
 nvidia/Llama-3.3-70B-Instruct-FP4

From 67d20a772848ebcac96a6ac2118ea70cf6cb5f56 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:12:59 -0700
Subject: [PATCH 08/11] add gh200 information, remove some colons

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 8999a06e6eb..87afc36017c 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -23,16 +23,17 @@ The performance numbers below were collected using the steps described in this d
 
 Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
 
-### Hardware:
+### Hardware
 The following GPU variants were used for testing:
 - H100 SXM 80GB (DGX H100)
 - H200 SXM 141GB (DGX H200)
+- GH200 96GB HBM3 (480GB LPDDR5X)
 - B200 180GB (DGX B200)
 - GB200 192GB (GB200 NVL72)
 
 Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads.
 
-### FP4 Models:
+### FP4 Models
 ```
 nvidia/Llama-3.3-70B-Instruct-FP4
 nvidia/Llama-3.1-405B-Instruct-FP4
@@ -76,7 +77,7 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 | 5000, 500 | | 1,032.40 | 1,078.87 |
 | 20000, 2000 | | 667.39 | 649.95 |
 
-### FP8 Models:
+### FP8 Models
 ```
 nvidia/Llama-3.1-8B-Instruct-FP8

From 9271f094556aab24e08b37b025447ae09dccda62 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:16:25 -0700
Subject: [PATCH 09/11] remove 20k/2k data point from llama 4 maverick benchmarks

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 87afc36017c..473f2fb6d11 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -158,7 +158,6 @@ Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affe
 | 2048, 128 | | 4,364.06 | 3,832.38 |
 | 2048, 2048 | | 12,800.89 | |
 | 5000, 500 | | 5,128.60 | |
-| 20000, 2000 | | 1,764.27 | 1,400.79 |
 
 ## Reproducing Benchmarked Results
 

From 79daeb1cb31fbc575b9b6d2ede83e1dfa6432726 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:27:34 -0700
Subject: [PATCH 10/11] add space after headers based on code rabbit lint feedback

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 473f2fb6d11..0908ad2cae2 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -34,6 +34,7 @@ Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads.
 
 ### FP4 Models
+
 ```
 nvidia/Llama-3.3-70B-Instruct-FP4
 nvidia/Llama-3.1-405B-Instruct-FP4
@@ -78,6 +79,7 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 | 20000, 2000 | | 667.39 | 649.95 |
 
 ### FP8 Models
+
 ```
 nvidia/Llama-3.1-8B-Instruct-FP8
 nvidia/Llama-3.3-70B-Instruct-FP8

From 6f44a997938bb5c192085d81937c863d84bdebe5 Mon Sep 17 00:00:00 2001
From: zpatel <22306219+zbpatel@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:29:41 -0700
Subject: [PATCH 11/11] add text header to code block

Signed-off-by: zpatel <22306219+zbpatel@users.noreply.github.com>
---
 docs/source/performance/perf-overview.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 0908ad2cae2..ab49a47fa5d 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -35,7 +35,7 @@ Other hardware variants may have different TDP, memory bandwidth, core count, or
 
 ### FP4 Models
 
-```
+```text
 nvidia/Llama-3.3-70B-Instruct-FP4
 nvidia/Llama-3.1-405B-Instruct-FP4
 ```
@@ -80,7 +80,7 @@ nvidia/Llama-3.1-405B-Instruct-FP4
 
 ### FP8 Models
 
-```
+```text
 nvidia/Llama-3.1-8B-Instruct-FP8
 nvidia/Llama-3.3-70B-Instruct-FP8
 nvidia/Llama-3.1-405B-Instruct-FP8