From 553ace5eb91fc07681fa9edf8b6c09827a72617a Mon Sep 17 00:00:00 2001 From: Nandor Licker Date: Wed, 4 Dec 2024 02:27:50 -0800 Subject: [PATCH] perf: reduce total_num_tiles_q by one (#644) The bound can be reduced by one to slightly decrease workspace memory usage. --- include/flashinfer/attention/scheduler.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flashinfer/attention/scheduler.cuh b/include/flashinfer/attention/scheduler.cuh index e08c4c0f..296b2ea6 100644 --- a/include/flashinfer/attention/scheduler.cuh +++ b/include/flashinfer/attention/scheduler.cuh @@ -483,7 +483,7 @@ inline auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h, // number of rows and the batch size. The sum of qo lengths rounded // up to cta_tile_q will not exceed this number derived from the total // number of rows. - total_num_tiles_q = ceil_div(total_num_rows, cta_tile_q) + batch_size; + total_num_tiles_q = ceil_div(total_num_rows, cta_tile_q) + batch_size - 1; } else { int64_t sum_packed_qo_len = 0; for (uint32_t i = 0; i < batch_size; ++i) {