Don't calculate KV scales dynamically if Q scale is included

ROCm · Dec 19, 2024 · 0bd414a
1 parent 06f53ba
commit 0bd414a
Showing 1 changed file with 1 addition and 0 deletions.
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -89,6 +89,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             q_scale = layer.q_scale.to("cpu").tolist()
             if current_platform.is_rocm() and not is_navi():
                 q_scale *= 2
+            layer.calculate_kv_scales = False
         else:
             q_scale = 1.0
         if layer.prob_scale > 0.0: