fix attn_mask on 310I
yao-fengchen committed Oct 24, 2024
1 parent 379b2d3 commit 27afd12
Showing 1 changed file with 2 additions and 1 deletion.
dlinfer/vendor/ascend/torch_npu_ops.py (3 changes: 2 additions & 1 deletion)
@@ -109,6 +109,7 @@ def prefill_attention(
     scale_value = (
         softmax_scale if softmax_scale else 1.0 / math.sqrt(query.shape[-1])
     )
+    assert SocVersion.is_Ascend910B() or SocVersion.is_Ascend310P()
     if SocVersion.is_Ascend910B():
         attn_output[:] = torch.ops.npu.npu_fusion_attention(
             query,
@@ -142,7 +143,7 @@ def prefill_attention(
             single_v,
             single_o,
             padding_mask=None,
-            atten_mask=attn_mask[0],
+            atten_mask=None,
             actual_seq_lengths=actual_seq_lengths,
             num_heads=num_q_heads,
             scale_value=scale_value,
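
For context, here is a minimal runnable sketch of the dispatch pattern this commit tightens: assert a supported SoC up front, hand the boolean mask to the fused 910B kernel, and pass no mask on 310P. The SocVersion stand-in and select_atten_mask helper below are simplified assumptions for illustration, not dlinfer's actual implementation; the real kernel calls live in dlinfer/vendor/ascend/torch_npu_ops.py.

class SocVersion:
    # Hypothetical stand-in: dlinfer's real helper inspects the NPU SoC
    # name reported by torch_npu at runtime.
    _soc = "Ascend310P3"

    @classmethod
    def is_Ascend910B(cls) -> bool:
        return cls._soc.startswith("Ascend910B")

    @classmethod
    def is_Ascend310P(cls) -> bool:
        return cls._soc.startswith("Ascend310P")


def select_atten_mask(attn_mask):
    # Fail fast on unsupported SoCs instead of falling through silently,
    # mirroring the assert added in this commit.
    assert SocVersion.is_Ascend910B() or SocVersion.is_Ascend310P()
    if SocVersion.is_Ascend910B():
        # 910B path: the fused kernel (torch.ops.npu.npu_fusion_attention
        # in the real code) consumes the mask directly.
        return attn_mask[0]
    # 310P path: after this commit no mask is passed; the diff shows the
    # kernel still receives actual_seq_lengths, which presumably bounds
    # the attention span per sequence.
    return None


print(select_atten_mask([object()]))  # -> None on the 310P stand-in

Guarding with an assert rather than an else branch makes an unsupported SoC fail loudly at the call site instead of producing wrong attention output.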
