diff --git a/torchao/testing/float8/roofline_utils.py b/torchao/testing/float8/roofline_utils.py
index 92becb9b94..f0b873c352 100644
--- a/torchao/testing/float8/roofline_utils.py
+++ b/torchao/testing/float8/roofline_utils.py
@@ -47,12 +47,10 @@
         "fp8_peak_tops": 2614e12,
         # 5.3 TB per second
         "peak_mem_bw_bytes_sec": 5.3e12,
-        # for now, copy over from H100
-        # TODO(future): run measurement on hardware
-        "pct_achievable_gemm_tops": 0.78,
-        # for now, copy over from H100
-        # TODO(future): run measurement on hardware
-        "pct_achievable_mem_bw": 0.92,
+        # based on microbenchmark (fw + bw gemms) with M,K,N = 3 * (8192,)
+        "pct_achievable_gemm_tops": 0.47,
+        # based on microbenchmark with pointwise triton kernel with large inputs
+        "pct_achievable_mem_bw": 0.72,
     },
     # TODO(future): more GPU names
 }