From 8e67e0e779fff4ec378b49710cbc3ad75fb4c096 Mon Sep 17 00:00:00 2001
From: Hazem Essam
Date: Tue, 8 Oct 2024 03:03:55 +0300
Subject: [PATCH] Flash attention support. (#20152)

* added flash attention support for pytorch

* added a comment explaining why the causal mask is created manually

* added unit tests for flash attention

* added test skipping for flash attention for numpy

* removed flash attn op and added support for flash attention inside dot_product_attention op

* added skip tests for every framework except torch

* formatted files

* added checks for flash attention in pytorch before computing attention and removed flash attention from tests

* added skipping tests for all frameworks except jax

* formatted files

* added conditions to skip tests for jax

* fixed typo
---
 keras/src/backend/jax/nn.py        | 18 ++++++++-
 keras/src/backend/numpy/nn.py      | 11 ++++-
 keras/src/backend/tensorflow/nn.py | 14 ++++++-
 keras/src/backend/torch/nn.py      | 56 +++++++++++++++++++++++--
 keras/src/ops/nn.py                | 32 +++++++++++++--
 keras/src/ops/nn_test.py           | 65 +++++++++++++++++++++++++-----
 6 files changed, 176 insertions(+), 20 deletions(-)

diff --git a/keras/src/backend/jax/nn.py b/keras/src/backend/jax/nn.py
index b549b3517e2..cba73918976 100644
--- a/keras/src/backend/jax/nn.py
+++ b/keras/src/backend/jax/nn.py
@@ -986,7 +986,14 @@ def _dot_product_attention_core(
 
 
 def dot_product_attention(
-    query, key, value, bias=None, mask=None, scale=None, is_causal=False
+    query,
+    key,
+    value,
+    bias=None,
+    mask=None,
+    scale=None,
+    is_causal=False,
+    flash_attention=False,
 ):
     query = convert_to_tensor(query)
     key = convert_to_tensor(key)
@@ -1000,6 +1007,7 @@ def dot_product_attention(
 
     # `dot_product_attention` is only available in jax>=0.4.31
    if hasattr(jax.nn, "dot_product_attention"):
+        implementation = "cudnn" if flash_attention else "xla"
         return jax.nn.dot_product_attention(
             query,
             key,
@@ -1008,6 +1016,14 @@ def dot_product_attention(
             mask=mask,
             scale=scale,
             is_causal=is_causal,
+            implementation=implementation,
+        )
+
+    if flash_attention:
+        raise ValueError(
+            "Flash attention is not supported in your "
+            "current JAX version. Please update it "
+            "using `pip install -U jax jaxlib`."
         )
 
     # Ref: jax.nn.dot_product_attention
diff --git a/keras/src/backend/numpy/nn.py b/keras/src/backend/numpy/nn.py
index f3e02d6d5a9..eea127e554a 100644
--- a/keras/src/backend/numpy/nn.py
+++ b/keras/src/backend/numpy/nn.py
@@ -1033,8 +1033,17 @@ def _dot_product_attention_xla(query, key, value, bias, mask, is_causal, scale):
 
 
 def dot_product_attention(
-    query, key, value, bias=None, mask=None, scale=None, is_causal=False
+    query,
+    key,
+    value,
+    bias=None,
+    mask=None,
+    scale=None,
+    is_causal=False,
+    flash_attention=False,
 ):
+    if flash_attention:
+        raise ValueError("Flash attention is not implemented in NumPy.")
     # Ref: jax.nn.dot_product_attention
     # https://github.com/jax-ml/jax/blob/jax-v0.4.32/jax/_src/nn/functions.py#L828
     # Not support `query_seq_lengths` and `key_value_seq_lengths` args
diff --git a/keras/src/backend/tensorflow/nn.py b/keras/src/backend/tensorflow/nn.py
index bc7c1e61486..01a1aca26d0 100644
--- a/keras/src/backend/tensorflow/nn.py
+++ b/keras/src/backend/tensorflow/nn.py
@@ -964,8 +964,20 @@ def _dot_product_attention_xla(query, key, value, bias, mask, is_causal, scale):
 
 
 def dot_product_attention(
-    query, key, value, bias=None, mask=None, scale=None, is_causal=False
+    query,
+    key,
+    value,
+    bias=None,
+    mask=None,
+    scale=None,
+    is_causal=False,
+    flash_attention=False,
 ):
+    if flash_attention:
+        raise ValueError(
+            "Flash attention is not supported yet in the TensorFlow backend."
+        )
+
     # Ref: jax.nn.dot_product_attention
     # https://github.com/jax-ml/jax/blob/jax-v0.4.32/jax/_src/nn/functions.py#L828
     # Not support `query_seq_lengths` and `key_value_seq_lengths` args
diff --git a/keras/src/backend/torch/nn.py b/keras/src/backend/torch/nn.py
index 449c0976aff..e4291f6b84c 100644
--- a/keras/src/backend/torch/nn.py
+++ b/keras/src/backend/torch/nn.py
@@ -864,8 +864,28 @@ def _get_large_negative(dtype):
     return convert_to_tensor(val * -0.7, dtype=dtype)
 
 
+def is_flash_attention_enabled(query, key, value, mask=None, is_causal=False):
+    params = torch.backends.cuda.SDPAParams(
+        query,
+        key,
+        value,
+        mask,
+        0.0,
+        is_causal,
+    )
+    is_enabled = torch.backends.cuda.can_use_flash_attention(params, False)
+    return is_enabled
+
+
 def dot_product_attention(
-    query, key, value, bias=None, mask=None, scale=None, is_causal=False
+    query,
+    key,
+    value,
+    bias=None,
+    mask=None,
+    scale=None,
+    is_causal=False,
+    flash_attention=False,
 ):
     if bias is not None:
         raise ValueError(
@@ -891,7 +911,35 @@ def dot_product_attention(
     query = torch.transpose(query, axis0, axis1)
     key = torch.transpose(key, axis0, axis1)
     value = torch.transpose(value, axis0, axis1)
-    attention_output = torch.nn.functional.scaled_dot_product_attention(
-        query, key, value, attn_mask=mask, is_causal=is_causal, scale=scale
-    )
+
+    if flash_attention:
+        is_enabled = is_flash_attention_enabled(
+            query=query,
+            key=key,
+            value=value,
+            mask=mask,
+            is_causal=is_causal,
+        )
+        if not is_enabled:
+            raise ValueError(
+                "Flash attention is not enabled in the `torch` backend. "
+                "The dtype of the inputs should be float16/bfloat16, "
+                "and your GPU should support flash attention."
+            )
+
+        with torch.nn.attention.sdpa_kernel(
+            backends=[torch.nn.attention.SDPBackend.FLASH_ATTENTION],
+        ):
+            attention_output = torch.nn.functional.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask=mask,
+                is_causal=is_causal,
+                scale=scale,
+            )
+    else:
+        attention_output = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=mask, is_causal=is_causal, scale=scale
+        )
     return torch.transpose(attention_output, axis1, axis0)
diff --git a/keras/src/ops/nn.py b/keras/src/ops/nn.py
index c0f65dc87cc..2d779582a5b 100644
--- a/keras/src/ops/nn.py
+++ b/keras/src/ops/nn.py
@@ -2131,7 +2131,16 @@ def __init__(self, is_causal=False):
         super().__init__()
         self.is_causal = is_causal
 
-    def call(self, query, key, value, bias=None, mask=None, scale=None):
+    def call(
+        self,
+        query,
+        key,
+        value,
+        bias=None,
+        mask=None,
+        scale=None,
+        flash_attention=False,
+    ):
         return backend.nn.dot_product_attention(
             query,
             key,
@@ -2140,10 +2149,18 @@ def call(self, query, key, value, bias=None, mask=None, scale=None):
             mask=mask,
             scale=scale,
             is_causal=self.is_causal,
+            flash_attention=flash_attention,
         )
 
     def compute_output_spec(
-        self, query, key, value, bias=None, mask=None, scale=None
+        self,
+        query,
+        key,
+        value,
+        bias=None,
+        mask=None,
+        scale=None,
+        flash_attention=False,
     ):
         return KerasTensor(query.shape, dtype=query.dtype)
 
@@ -2152,7 +2169,14 @@ def compute_output_spec(
     ["keras.ops.dot_product_attention", "keras.ops.nn.dot_product_attention"]
 )
 def dot_product_attention(
-    query, key, value, bias=None, mask=None, scale=None, is_causal=False
+    query,
+    key,
+    value,
+    bias=None,
+    mask=None,
+    scale=None,
+    is_causal=False,
+    flash_attention=False,
 ):
     """Scaled dot product attention function.
 
@@ -2207,6 +2231,7 @@ def dot_product_attention(
             bias=bias,
             mask=mask,
             scale=scale,
+            flash_attention=flash_attention,
         )
     return backend.nn.dot_product_attention(
         query,
@@ -2216,4 +2241,5 @@ def dot_product_attention(
         mask=mask,
         scale=scale,
         is_causal=is_causal,
+        flash_attention=flash_attention,
     )
diff --git a/keras/src/ops/nn_test.py b/keras/src/ops/nn_test.py
index fe8d34fc656..4d75760d894 100644
--- a/keras/src/ops/nn_test.py
+++ b/keras/src/ops/nn_test.py
@@ -2208,9 +2208,12 @@ def test_psnr(self):
             bias=(None, True),
             scale=(None, 1.0),
             mask_and_is_causal=((None, False), (True, False), (None, True)),
+            flash_attention=(True, False),
         )
     )
-    def test_dot_product_attention(self, bias, scale, mask_and_is_causal):
+    def test_dot_product_attention(
+        self, bias, scale, mask_and_is_causal, flash_attention
+    ):
         mask, is_causal = mask_and_is_causal
         query_shape = (2, 3, 4, 5)
         key_shape = (2, 6, 4, 5)
@@ -2232,6 +2235,57 @@ def test_dot_product_attention(self, bias, scale, mask_and_is_causal):
                 mask_shape
             )
 
+        if flash_attention and backend.backend() in [
+            "torch",
+            "tensorflow",
+            "numpy",
+        ]:
+            self.skipTest(
+                "Flash attention is not supported in the TF and NumPy "
+                "backends; PyTorch support has specific requirements."
+            )
+
+        if flash_attention and backend.backend() == "jax":
+            try:
+                outputs = knn.dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    bias=bias,
+                    mask=mask,
+                    scale=scale,
+                    is_causal=is_causal,
+                    flash_attention=flash_attention,
+                )
+            except ValueError as e:
+                if e.args[0].startswith(
+                    "Flash attention is not supported in your "
+                    "current JAX version"
+                ):
+                    self.skipTest(
+                        "JAX version does not have "
+                        "`dot_product_attention` function."
+                    )
+            except RuntimeError as e:
+                if e.args[0] == "cuDNN is not detected.":
+                    self.skipTest("No CuDNN to run flash attention for JAX.")
+                elif e.args[0] == "Require at least Ampere arch to run":
+                    self.skipTest(
+                        "Requires at least Ampere arch to run flash attention "
+                        "for JAX."
+                    )
+        else:
+            outputs = knn.dot_product_attention(
+                query,
+                key,
+                value,
+                bias=bias,
+                mask=mask,
+                scale=scale,
+                is_causal=is_causal,
+                flash_attention=flash_attention,
+            )
+
         expected = _dot_product_attention(
             query,
             key,
             value,
             bias=bias,
             mask=mask,
             scale=scale,
             is_causal=is_causal,
         )
-        outputs = knn.dot_product_attention(
-            query,
-            key,
-            value,
-            bias=bias,
-            mask=mask,
-            scale=scale,
-            is_causal=is_causal,
-        )
 
         self.assertAllClose(outputs, expected)
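
A minimal usage sketch of the new `flash_attention` flag exposed by this patch through `keras.ops.dot_product_attention`. The input shapes and the float16 dtype are illustrative assumptions, and the fallback mirrors the error handling the backends above implement (the flag raises on NumPy, TensorFlow, an older JAX, or a torch setup without flash attention support):

import numpy as np

from keras import ops

# Illustrative inputs in the (batch, seq_len, num_heads, head_dim) layout
# used by the unit test above; float16/bfloat16 is what the torch flash
# attention path expects.
query = np.random.rand(2, 3, 4, 5).astype("float16")
key = np.random.rand(2, 6, 4, 5).astype("float16")
value = np.random.rand(2, 6, 4, 5).astype("float16")

try:
    # Explicitly request the flash attention path added by this patch.
    outputs = ops.dot_product_attention(
        query, key, value, flash_attention=True
    )
except (ValueError, RuntimeError):
    # ValueError: backends that reject the flag (NumPy, TensorFlow, an old
    # JAX, or torch without a supported GPU/dtype), as implemented above.
    # RuntimeError: JAX's cuDNN implementation when no suitable GPU is found.
    outputs = ops.dot_product_attention(query, key, value)

print(outputs.shape)  # (2, 3, 4, 5)

On the torch backend the flag only takes effect when `torch.backends.cuda.can_use_flash_attention` reports support, and on JAX it selects the `cudnn` implementation of `jax.nn.dot_product_attention`; the default `flash_attention=False` keeps the existing behavior on every backend.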