From 74102c9f9542f34188d31c74051518e9ee1401b8 Mon Sep 17 00:00:00 2001
From: "steve.an"
Date: Tue, 10 Oct 2023 10:39:19 +0900
Subject: [PATCH] Use power of 2 size tensors in Attention benchmark

---
 benchmarks/benchmark_attention.py |  6 +++---
 trident/operation/attention.py    | 12 +++++++++++-
 trident/util/util.py              |  4 ++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_attention.py b/benchmarks/benchmark_attention.py
index 264c5d7d..c8a29a6d 100644
--- a/benchmarks/benchmark_attention.py
+++ b/benchmarks/benchmark_attention.py
@@ -20,7 +20,7 @@
 
 
 @util.report(
-    "attention forward", ["y_size"], [32 * i for i in range(1, 21)], {"num_batches": 64, "num_heads": 8, "x_size": 64}
+    "attention forward", ["y_size"], [2**i for i in range(5, 10)], {"num_batches": 64, "num_heads": 8, "x_size": 64}
 )
 def bench_attention_forward(num_batches, num_heads, y_size, x_size, dtype, backend):
     factory_kwargs = {"device": "cuda", "dtype": dtype}
@@ -39,8 +39,8 @@ def bench_attention_forward(num_batches, num_heads, y_size, x_size, dtype, backe
 @util.report(
     "attention backward",
     ["y_size"],
-    [64 * i for i in range(1, 21)],
-    {"num_batches": 64, "num_heads": 8, "x_size": 64},
+    [2**i for i in range(5, 10)],
+    {"num_batches": 32, "num_heads": 8, "x_size": 64},
 )
 def bench_attention_backward(num_batches, num_heads, y_size, x_size, dtype, backend):
     factory_kwargs = {"device": "cuda", "dtype": dtype}
diff --git a/trident/operation/attention.py b/trident/operation/attention.py
index fa02549e..ede78438 100644
--- a/trident/operation/attention.py
+++ b/trident/operation/attention.py
@@ -28,6 +28,16 @@ def forward(ctx: Any, *args: Any, **kwargs: Any):
         if query.dim() != 4 or key.dim() != 4 or value.dim() != 4:
             raise ValueError("The dimension of query, key and value should be 4.")
 
+        if (
+            not util.is_pow2(query.shape[-2])
+            or not util.is_pow2(query.shape[-1])
+            or not util.is_pow2(key.shape[-2])
+            or not util.is_pow2(key.shape[-1])
+            or not util.is_pow2(value.shape[-2])
+            or not util.is_pow2(value.shape[-1])
+        ):
+            raise ValueError("Attention only supports power-of-2 sized tensors.")
+
         if mask is not None:
             if is_causal:
                 raise ValueError("Error because both attn_mask and is_causal are set.")
@@ -185,7 +195,7 @@ def grid(meta):
             util.dtype(grad_query.dtype),
             64,
             triton.next_power_of_2(x_size),
-            num_warps=4 if x_size <= 64 else 8,
+            num_warps=2,
         )
         util.pop_trace()
 
diff --git a/trident/util/util.py b/trident/util/util.py
index 048ec74c..7b6d9165 100644
--- a/trident/util/util.py
+++ b/trident/util/util.py
@@ -58,6 +58,10 @@ def dtype(input):
     raise ValueError(f"Unable to convert the given input: '{input}'.")
 
 
+def is_pow2(value):
+    return False if value == 0 else (value & (value - 1)) == 0
+
+
 def size_and_stride(input: torch.Tensor, dim: int):
     if input.dim() == 2:
         if dim == 0:
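
For reference, below is a minimal, standalone sketch of the power-of-2 guard this patch introduces. The is_pow2 helper is taken verbatim from the addition to trident/util/util.py; the check_pow2_shapes wrapper and the example tensors are hypothetical and only illustrate how the new validation in Attention.forward is expected to behave.

import torch


def is_pow2(value):
    # Same bit trick as the helper added to trident/util/util.py: a power of
    # two has exactly one set bit, so value & (value - 1) clears it to zero.
    return False if value == 0 else (value & (value - 1)) == 0


def check_pow2_shapes(query, key, value):
    # Hypothetical stand-in for the guard added to Attention.forward: the last
    # two dimensions (sequence length and head size) of query, key and value
    # must all be powers of two.
    for tensor in (query, key, value):
        if not (is_pow2(tensor.shape[-2]) and is_pow2(tensor.shape[-1])):
            raise ValueError("Attention only supports power-of-2 sized tensors.")


# Shapes mirror the benchmark defaults: num_batches=64, num_heads=8, x_size=64.
q = k = v = torch.empty(64, 8, 512, 64)  # y_size=512 is a power of two, so this passes
check_pow2_shapes(q, k, v)

try:
    check_pow2_shapes(torch.empty(64, 8, 500, 64), k, v)  # y_size=500 is rejected
except ValueError as error:
    print(error)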