From e64d347def39bb47101efafe4177adf9f77a63ec Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk
Date: Thu, 5 Sep 2024 20:09:04 +0300
Subject: [PATCH] Add a parametrized benchmark for swiglu (#1105)

---
 thunder/benchmarks/__init__.py | 82 ++++++++++++++++++++++++++++++++++
 thunder/benchmarks/targets.py  | 65 ++++++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/thunder/benchmarks/__init__.py b/thunder/benchmarks/__init__.py
index ba16b1ac4c..e5f7756c83 100644
--- a/thunder/benchmarks/__init__.py
+++ b/thunder/benchmarks/__init__.py
@@ -1240,6 +1240,88 @@ def foo(a):
         return foo


+class LitGPTSwigluBenchmark(Benchmark, metaclass=UserFacingBenchmarkMeta):
+    _args = (
+        BenchmarkArg(
+            name="config",
+            description="The LitGPT config (str, LitGPTConfig) to use. See the litgpt_model.py for details.",
+        ),
+        BenchmarkArg(
+            name="batchdims",
+            description="The shape (Sequence[int]) of input batch dimensions. The input will have innermost dimensions of (config.block_size, config.intermediate_size). Default is (16,).",
+        ),
+        BenchmarkArg(
+            name="device",
+            description="A string representing the device to run on. Default is 'cuda'.",
+        ),
+        BenchmarkArg(
+            name="dtype",
+            description="The dtype of the tensors. Default is thunder.float32.",
+        ),
+        BenchmarkArg(
+            name="requires_grad",
+            description="Whether the input tensors require grad. Default is False.",
+        ),
+    )
+
+    @classmethod
+    @property
+    def name(cls) -> str:
+        return "litgpt-swiglu"
+
+    @classmethod
+    @property
+    def description(cls) -> str:
+        return "LitGPT's 'swiglu' elementwise binary operation."
+
+    @classmethod
+    @property
+    def args(cls) -> tuple[BenchmarkArg, ...]:
+        return cls._args
+
+    def __init__(
+        self,
+        config: str | LitGPTConfig,
+        batchdims: Sequence[int],
+        device: str,
+        dtype: dtypes.dtype,
+        requires_grad: bool,
+        use_liger: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
+        self.batchdims = batchdims
+        self.shape: Sequence[int] = batchdims + (self.config.block_size, self.config.intermediate_size)
+        self.device: str = device
+        self.dtype: dtypes.dtype = dtype
+        self.tdtype: torch.dtype = ltorch.to_torch_dtype(dtype)
+        self.requires_grad: bool = requires_grad
+        self.devices: list[str] = [device]
+        self.use_liger: bool = use_liger
+
+    def make_batch(self) -> tuple[list, dict]:
+        return (
+            make_tensor(self.shape, device=self.device, dtype=self.tdtype, requires_grad=self.requires_grad),
+            make_tensor(self.shape, device=self.device, dtype=self.tdtype, requires_grad=self.requires_grad),
+        ), {}
+
+    def fn(self) -> Callable:
+        # https://github.com/Lightning-AI/litgpt/blob/fdf6a120056d1363287285599eb84907f6c589b9/litgpt/model.py#L372
+        def fn(x_fc_1, x_fc_2):
+            return torch.nn.functional.silu(x_fc_1) * x_fc_2
+
+        if self.use_liger:
+            try:
+                from liger_kernel.ops.swiglu import LigerSiLUMulFunction
+
+                return LigerSiLUMulFunction.apply
+            except ImportError:
+                raise ImportError("Requested to use the Liger SiLU Mul function, but the Liger kernel is not available")
+
+        return fn
+
+
 class NanoGPTBenchmark(Benchmark, metaclass=UserFacingBenchmarkMeta):
     _args = (
         BenchmarkArg(
diff --git a/thunder/benchmarks/targets.py b/thunder/benchmarks/targets.py
index 906454a018..4a77aa33cd 100644
--- a/thunder/benchmarks/targets.py
+++ b/thunder/benchmarks/targets.py
@@ -18,6 +18,7 @@
     LitGPTBenchmark,
     LitGPTCausalSelfAttentionBenchmark,
     LitGPTSDPABenchmark,
+    LitGPTSwigluBenchmark,
     LlamaMLPBenchmark,
     NanoGPTBenchmark,
     NanoGPTCrossEntropyBenchmark,
@@ -39,7 +40,7 @@
 from thunder.tests.litgpt_model import Config as LitGPTConfig
 from thunder.tests.make_tensor import make_tensor

-
+LIGER_FUSED_SWIGLU_AVAILABLE: bool = package_available("liger_kernel.ops.swiglu")
 APEX_FUSED_ROPE_AVAILABLE: bool = package_available("fused_rotary_positional_embedding")
 IMPORTANT_CONFIGS = [
     "Llama-2-13b-hf",
@@ -233,6 +234,68 @@ def test_litgpt_gelu(benchmark, executor: Callable, bs: int, compute_type: Compu
     benchmark_for_compute_type(compute_type, benchmark, fn, args, kwargs)


+# There are many configurations but only the following parameters affect the swiglu benchmark:
+# - intermediate_size
+# - block_size
+# Let's select only the configurations that differ in these parameters
+def get_configs_for_swiglu():
+    return get_unique_configs(("intermediate_size", "block_size"))
+
+
+swiglu_executors = (
+    (torch_executor, False),
+    (torch_compile_executor, False),
+    (thunder_executor, False),
+    (torch_executor, True),
+    (torch_compile_executor, True),
+)
+swiglu_executors_ids = (
+    "torch",
+    "torch.compile",
+    "thunder",
+    "torch+liger",
+    "torch.compile+liger",
+)
+
+
+# Sample command to run this benchmark:
+# pytest thunder/benchmarks/targets.py -k "test_litgpt_swiglu" --benchmark-group-by='param:config,param:bs,param:compute_type'
+@pytest.mark.parametrize(
+    "executor,use_liger,",
+    swiglu_executors,
+    ids=swiglu_executors_ids,
+)
+# bs = batch size
+# It's typically small for LLMs
+@pytest.mark.parametrize(
+    "bs,",
+    (2**i for i in range(0, 2)),
+    ids=(f"bs{2**i}" for i in range(0, 2)),
+)
+@parametrize_compute_type
+@pytest.mark.parametrize(
+    "config,",
+    get_configs_for_swiglu(),
+)
+def test_litgpt_swiglu(benchmark, executor: Callable, use_liger: bool, bs: int, compute_type: ComputeType, config: str):
+    if use_liger and not LIGER_FUSED_SWIGLU_AVAILABLE:
+        pytest.skip("Liger fused swiglu is unavailable")
+
+    bench: Benchmark = LitGPTSwigluBenchmark(
+        config=config,
+        batchdims=(bs,),
+        device="cuda:0",
+        dtype=thunder.bfloat16,
+        requires_grad=is_requires_grad(compute_type),
+        use_liger=use_liger,
+    )
+
+    args, kwargs = bench.make_batch()
+    fn = executor(bench.fn())
+
+    benchmark_for_compute_type(compute_type, benchmark, fn, args, kwargs)
+
+
 @pytest.mark.parametrize(
     "executor,",
     executors,
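
Note (not part of the patch): the sketch below shows one way to exercise the new benchmark payload directly, outside the pytest-benchmark harness, by timing the eager and Thunder-compiled swiglu with CUDA events. It assumes torch_executor and thunder_executor are importable from thunder.benchmarks alongside LitGPTSwigluBenchmark, and uses "Llama-2-7b-hf" purely as an example config name; adjust both to your setup.

# Standalone usage sketch under the assumptions stated above.
import torch
import thunder
from thunder.benchmarks import LitGPTSwigluBenchmark, thunder_executor, torch_executor

bench = LitGPTSwigluBenchmark(
    config="Llama-2-7b-hf",  # example config name; any LitGPT config should work
    batchdims=(1,),
    device="cuda:0",
    dtype=thunder.bfloat16,
    requires_grad=False,
    use_liger=False,  # set True to time liger_kernel's LigerSiLUMulFunction instead
)
args, kwargs = bench.make_batch()

for name, executor in (("torch", torch_executor), ("thunder", thunder_executor)):
    fn = executor(bench.fn())
    fn(*args, **kwargs)  # warmup / compilation
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(100):
        fn(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()
    print(f"{name}: {start.elapsed_time(end) / 100:.3f} ms/iter")

Grouping the pytest run with --benchmark-group-by='param:config,param:bs,param:compute_type' (as in the sample command inside the patch) puts all five executor variants of the same configuration side by side in the report, which is the intended way to compare torch, torch.compile, thunder, and the two Liger-backed variants.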