diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index a29622b7d25c3..3c62008fbfcc1 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -492,12 +492,14 @@ def fused_experts(hidden_states: torch.Tensor,
         if tokens_in_chunk == 0:
             break
 
-        if tokens_in_chunk < CHUNK_SIZE:
-            # will only happen in the last chunk
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
             intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
-            # reload config to get better performance on the last chunk
             config = get_config_func(tokens_in_chunk)
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]