Update llama tests for block size 32 (#696)
The block_seq_stride default is changing from 16 to 32, so this PR updates the
tests to pass the block_seq_stride flag explicitly and to use the new numpy
inputs for block size 32 so the benchmarks run correctly. This PR also removes
the decomposed fp16 tests, which are no longer needed.
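
For illustration, a minimal sketch of how a test might opt into the new stride
via the updated export helper (assuming the diffed class below is sharktank's
ExportArtifacts; constructor arguments other than those visible in the diff
are hypothetical):

    from sharktank.utils.export_artifacts import ExportArtifacts

    # Hypothetical invocation; paths and target values are placeholders.
    artifacts = ExportArtifacts(
        irpa_path="/path/to/llama3_8b_fp16.irpa",  # assumed argument name
        batch_size=4,                              # assumed argument name
        iree_hip_target="gfx942",                  # assumed argument name
        iree_hal_target_backends="rocm",
        attention_kernel="torch",
        tensor_parallelism_size=1,
        block_seq_stride=32,  # new optional argument; None keeps the exporter default
    )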

---------

Signed-off-by: aviator19941 <[email protected]>
aviator19941 authored Dec 16, 2024
1 parent 4f542ac commit ba78824
Showing 2 changed files with 32 additions and 188 deletions.
sharktank/sharktank/utils/export_artifacts.py (4 additions, 1 deletion)
@@ -92,6 +92,7 @@ def __init__(
         iree_hal_target_backends: str,
         attention_kernel: str,
         tensor_parallelism_size: int,
+        block_seq_stride: Optional[int] = None,
     ):
         self.sharktank_dir = str(
             Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent
@@ -102,6 +103,7 @@ def __init__(
         self.iree_hal_target_backends = iree_hal_target_backends
         self.attention_kernel = attention_kernel
         self.tensor_parallelism_size = tensor_parallelism_size
+        self.block_seq_stride = block_seq_stride
 
     def timeit(func):
         def wrapper(*args, **kwargs):
@@ -184,6 +186,8 @@ def export_to_mlir(
         if self.attention_kernel in ["decomposed", "torch"]:
             export_args.append("--attention-kernel")
             export_args.append(self.attention_kernel)
+        if self.block_seq_stride:
+            export_args.append(f"--block-seq-stride={self.block_seq_stride}")
 
         cwd = self.sharktank_dir
         cmd = subprocess.list2cmdline(export_args)
@@ -280,7 +284,6 @@ def iree_benchmark_vmfb(
         benchmark_args += [
             "iree-benchmark-module",
             "--hip_use_streams=true",
-            "--hip_allow_inline_execution=true",
             "--device_allocator=caching",
             f"--module={vmfb_name}",
         ]
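
To make the new plumbing concrete, here is a self-contained sketch of the
conditional added in export_to_mlir above; the exporter module path is an
assumption (only the --block-seq-stride flag handling is confirmed by this
diff):

    from typing import List, Optional

    def build_export_args(block_seq_stride: Optional[int] = None) -> List[str]:
        # Mirror the diffed logic: append the flag only when a stride is
        # explicitly provided, so omitting it keeps the exporter default.
        export_args = [
            "python3",
            "-m",
            "sharktank.examples.export_paged_llm_v1",  # assumed module path
        ]
        if block_seq_stride:
            export_args.append(f"--block-seq-stride={block_seq_stride}")
        return export_args

    assert "--block-seq-stride=32" in build_export_args(32)
    assert all("block-seq-stride" not in arg for arg in build_export_args())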

