diff --git a/.gitignore b/.gitignore
index 485cccfcf..f818d1151 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@
 
 # Clangd cache
 .cache
+
+# Python venv
+venv*
diff --git a/.gitmodules b/.gitmodules
index 5ed61a524..00d892bd3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,6 +2,8 @@
 	path = llvm
 	url = https://github.com/llvm/llvm-project.git
 	branch = main
+	shallow = true
 [submodule "thirdparty/mimalloc"]
 	path = thirdparty/mimalloc
 	url = https://github.com/microsoft/mimalloc.git
+	shallow = true
diff --git a/benchmark/makefile b/benchmark/makefile
index 3dc5543b7..112d554c1 100644
--- a/benchmark/makefile
+++ b/benchmark/makefile
@@ -33,7 +33,7 @@ all:$(OUT)
 	$(shell rm -rf tempFile)
 
 BUDDY_OPT_OPTIONS := -conv-vectorization="strip-mining=${STRIP}" -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts
-MLIR_OPT_OPTIONS := -convert-linalg-to-loops -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf -convert-scf-to-cf -convert-vector-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts
+MLIR_OPT_OPTIONS := -convert-linalg-to-loops -lower-affine -convert-scf-to-cf -convert-scf-to-cf -convert-vector-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts
 
 $(OUT):$(SOURCE)
 	@echo $*
diff --git a/docs/IIRVectorizationAlgorithm.md b/docs/IIRVectorizationAlgorithm.md
new file mode 100644
index 000000000..420450581
--- /dev/null
+++ b/docs/IIRVectorizationAlgorithm.md
@@ -0,0 +1,53 @@
+# Algorithm Explanation
+
+This document shows the details of the algorithms used in DAPVectorization pass. 
+
+## IIR Vectorization Implementation
+
+An IIR filter can be represented in different forms, typically ZPK (Zero-Pole-Gain) form or SOS (Second-Order Sections) form. A filter can be defined in ZPK form and then transformed to SOS form.
+
+### Scalar Computation for IIR Operation
+
+Currently, our IIR operation supports filters in SOS form. When the filter has only one set of parameters, they are denoted as {$b_0, b_1, b_2, a_1, a_2$}, with subscripts distinguishing the individual parameters. The equation takes the following form:
+
+**IIR with one set of params:**
+$$ y_n = b_0 x_n + b_1 x_{n-1} - a_1 y_{n-1} + b_2 x_{n-2} - a_2 y_{n-2} $$
+
+When the filter has multiple sets of parameters, the operation uses a cascade method for calculation. Take two sets of parameters as an example: the filter parameters are denoted as {$b_0^0, b_1^0, b_2^0, a_1^0, a_2^0$} and {$b_0^1, b_1^1, b_2^1, a_1^1, a_2^1$}, where the superscript indicates which set a parameter belongs to. The process is listed below:
+
+**IIR with two sets of params:**
+$$y_n^0 = b_0^0 x_n^0 + b_1^0 x_{n-1}^0 - a_1^0 y_{n-1}^0 + b_2^0 x_{n-2}^0 - a_2^0 y_{n-2}^0 $$
+$$x_n^1 = y_n^0$$
+$$y_n^1 = b_0^1 x_n^1 + b_1^1 x_{n-1}^1 - a_1^1 y_{n-1}^1 + b_2^1 x_{n-2}^1 - a_2^1 y_{n-2}^1$$
+
+### Vectorization for IIR Operation
+
+This section shows the implementation of IIR Vectorization algorithm. The example shown below contains 4 sets of parameters, with superscript {$0, 1, 2, 3$} representing each set of parameters.
+
+1. **Segment IIR Equation & Generate Vector Params**
+   ![Segment IIR Equation to three parts due to different time moment](./Images/IIRSegmentation.png)
+    The IIR equation is segmented into 3 parts, and each part is calculated at a different time moment. When $S2$ is calculated at time $t_i$, it is used to calculate $S1$ at time $t_{i+1}$, which then produces the final result at time $t_{i+2}$.
+
+   ![Generate SOS params in vector form](./Images/IIRVectorParams.png)
+    In the above image, vector $B0$ is the collection of all $b_0$ params; the other vectors $B1, B2, A1, A2$ each collect their corresponding params.
+    
+2. **Computing One Set of Params**
+   ![Computing step 1](./Images/IIRComputing1.png)
+    The first step in computation, calculate $y_0^0$ with the following equation:
+    $$𝑦_0^0=𝑏_0^0𝑥_0+s_1^0$$
+    At time moment $0$, the initial values of $S1, S2$ were set to $0$.
+   ![Computing step 2](./Images/IIRComputing2.png)
+    The second step in computation, calculate $s_1^0$ with the following equation:
+    $$𝑠_1^0=𝑏_1^0𝑥_0−𝑎_1^0𝑦_0^0+s_2^0 $$
+   ![Computing step 3](./Images/IIRComputing3.png)
+    The third step in computation, calculate $s_2^0$ with the following equation:
+    $$𝑠_2^0=𝑏_2^0𝑥_0−𝑎_2^0𝑦_0^0$$
+
+    The above three steps happen at the same time moment $t$, i.e. within the same loop iteration of the program. The order of these three steps cannot change, because the values read from vectors $S1, S2$ were actually produced before time moment $t$.
+3. **Cascade Method**
+   ![Cascade step 1](./Images/IIRCascade1.png)
+    Now that the values $y_0^0$, $s_1^0$ and $s_2^0$ have been produced, the whole system takes a new input $x_1$ and the computation moves on.
+   ![Cascade step 2](./Images/IIRCascade2.png)
+    The value $y_0^0$ is moved right and the new input $x_1$ is pushed in. The values in vectors $S1$ and $S2$ are not changed, and the computation jumps back to the second step. The difference in the next iteration is that two sets of parameters are used, and this is where the performance improves.
+
+    When the example above reaches the fourth iteration, the computation uses all the parameters. This situation holds for the vast majority of the computation. Also, with a longer vector length (currently 4, 8, 16, 32 and 64 are supported), it can achieve a 10x performance improvement.
diff --git a/docs/Images/IIRCascade1.png b/docs/Images/IIRCascade1.png
new file mode 100644
index 000000000..e766735b1
Binary files /dev/null and b/docs/Images/IIRCascade1.png differ
diff --git a/docs/Images/IIRCascade2.png b/docs/Images/IIRCascade2.png
new file mode 100644
index 000000000..5e97da94d
Binary files /dev/null and b/docs/Images/IIRCascade2.png differ
diff --git a/docs/Images/IIRComputing1.png b/docs/Images/IIRComputing1.png
new file mode 100644
index 000000000..69b5f33de
Binary files /dev/null and b/docs/Images/IIRComputing1.png differ
diff --git a/docs/Images/IIRComputing2.png b/docs/Images/IIRComputing2.png
new file mode 100644
index 000000000..1e33f8532
Binary files /dev/null and b/docs/Images/IIRComputing2.png differ
diff --git a/docs/Images/IIRComputing3.png b/docs/Images/IIRComputing3.png
new file mode 100644
index 000000000..4a8d3d2b4
Binary files /dev/null and b/docs/Images/IIRComputing3.png differ
diff --git a/docs/Images/IIRSegmentation.png b/docs/Images/IIRSegmentation.png
new file mode 100644
index 000000000..e498e8652
Binary files /dev/null and b/docs/Images/IIRSegmentation.png differ
diff --git a/docs/Images/IIRVectorParams.png b/docs/Images/IIRVectorParams.png
new file mode 100644
index 000000000..87e0e81bf
Binary files /dev/null and b/docs/Images/IIRVectorParams.png differ
diff --git a/examples/BuddyBert/import-bert.py b/examples/BuddyBert/import-bert.py
index c2044cb03..92e8e055e 100644
--- a/examples/BuddyBert/import-bert.py
+++ b/examples/BuddyBert/import-bert.py
@@ -46,12 +46,16 @@
     "attention_mask": torch.tensor([[1 for _ in range(5)]], dtype=torch.int64),
 }
 with torch.no_grad():
-    module, params = dynamo_compiler.importer(model, **inputs)
+    graphs = dynamo_compiler.importer(model, **inputs)
 
+assert len(graphs) == 1
+graph = graphs[0]
+params = dynamo_compiler.imported_params[graph]
+graph.lower_to_top_level_ir(do_params_pack=True)
 current_path = os.path.dirname(os.path.abspath(__file__))
 
 with open(Path(current_path) / "bert.mlir", "w") as module_file:
-    module_file.write(str(module))
+    module_file.write(str(graph._imported_module))
 
 float32_param = np.concatenate(
     [param.detach().numpy().reshape([-1]) for param in params[:-1]]
diff --git a/examples/BuddyGPU/matmul.mlir b/examples/BuddyGPU/matmul.mlir
new file mode 100644
index 000000000..642fc2d0a
--- /dev/null
+++ b/examples/BuddyGPU/matmul.mlir
@@ -0,0 +1,8 @@
+module {
+  func.func @forward(%arg0: tensor<5376x2048xf32>, %arg1: tensor<2048x5376xf32>) -> tensor<5376x5376xf32> {
+    %cst = arith.constant dense<0.000000e+00> : tensor<5376x5376xf32>
+    %0 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<5376x2048xf32>, tensor<2048x5376xf32>) outs(%cst : tensor<5376x5376xf32>) -> tensor<5376x5376xf32>
+    return %0 : tensor<5376x5376xf32>
+  }
+}
+
diff --git a/examples/BuddyGPU/matmul.py b/examples/BuddyGPU/matmul.py
new file mode 100644
index 000000000..af6dfe341
--- /dev/null
+++ b/examples/BuddyGPU/matmul.py
@@ -0,0 +1,54 @@
+# ===- matmul.py --------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===--------------------------------------------------------------------------
+#
+# This file demonstrates the usage of Buddy's frontend for PyTorch module.
+#
+# ===--------------------------------------------------------------------------
+
+import os
+import time
+
+import numpy
+import torch
+from transformers import LlamaForCausalLM, LlamaTokenizer
+from torch._functorch.aot_autograd import aot_autograd_decompositions
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+dtype = torch.float32
+
+def foo(x, y):  # function handed to the Buddy importer below
+    return torch.matmul(x, y)  # plain matrix product of the two inputs
+
+in1 = torch.ones([5376, 2048], dtype=torch.float32)
+in2 = torch.ones([2048, 5376], dtype=torch.float32)
+# Initialize Dynamo Compiler with specific configurations as an importer.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+
+path_prefix = os.path.dirname(os.path.abspath(__file__))
+# Write the MLIR module to the file.
+with open(os.path.join(path_prefix, "matmul.mlir"), "w") as module_file:
+    print(graph._imported_module, file=module_file)
diff --git a/examples/BuddyGPU/run-module.py b/examples/BuddyGPU/run-module.py
new file mode 100644
index 000000000..ef29995f4
--- /dev/null
+++ b/examples/BuddyGPU/run-module.py
@@ -0,0 +1,195 @@
+import mlir.ir as ir
+import mlir.dialects.func as func
+import mlir.dialects.memref as memref
+from mlir.passmanager import *
+from mlir.execution_engine import *
+from mlir import runtime as rt
+from mlir.ir import *
+import numpy as np
+import ctypes
+import gc
+import torch
+
+
+def to_numpy(element_type: str) -> np.dtype:  # MLIR element type name -> NumPy scalar type; ValueError if unsupported
+    match element_type:
+        case "f16":
+            return np.float16
+        case "f32":
+            return np.float32
+        case "f64":
+            return np.float64
+        case "i8":
+            return np.int8
+        case "i16":
+            return np.int16
+        case "i32":
+            return np.int32
+        case "i64":
+            return np.int64
+        case "bf16":
+            raise ValueError("bf16 is not supported by numpy")  # raise, so callers never receive an exception object as a dtype
+        case _:
+            raise ValueError(f"Unsupported type: {element_type}")
+
+
+def to_mlir(dtype: np.dtype) -> ir.Type:  # NumPy dtype -> matching MLIR float / signless integer type
+    match dtype:
+        case np.float16:
+            return ir.F16Type.get()
+        case np.float32:
+            return ir.F32Type.get()
+        case np.float64:
+            return ir.F64Type.get()
+        case np.int8:
+            return ir.IntegerType.get_signless(8)
+        case np.int16:
+            return ir.IntegerType.get_signless(16)
+        case np.int32:
+            return ir.IntegerType.get_signless(32)
+        case np.int64:
+            return ir.IntegerType.get_signless(64)
+        case _:
+            raise ValueError(f"Unsupported type: {dtype}")  # any dtype not listed above is rejected
+
+
+def lower_to_llvm_cpu(module: Module) -> Module:  # run the pass list below on module.operation and return the same module
+    pm = PassManager("builtin.module")
+    pm.add("func.func(tosa-to-linalg-named)")  # TOSA -> linalg/tensor/arith
+    pm.add("func.func(tosa-to-linalg)")
+    pm.add("func.func(tosa-to-tensor)")
+    pm.add("func.func(tosa-to-arith)")
+    pm.add("arith-expand")
+    pm.add("eliminate-empty-tensors")
+    pm.add("empty-tensor-to-alloc-tensor")
+    pm.add("convert-elementwise-to-linalg")
+    pm.add("one-shot-bufferize")
+    pm.add("func.func(convert-linalg-to-affine-loops)")  # linalg -> affine loops, then fuse and parallelize
+    pm.add("affine-loop-fusion")
+    pm.add("func.func(affine-parallelize)")
+    pm.add("lower-affine")
+    pm.add("convert-scf-to-openmp")  # parallel loops become OpenMP ops
+    pm.add("func-bufferize")
+    pm.add("arith-bufferize")
+    pm.add("func.func(tensor-bufferize)")
+    pm.add("func.func(buffer-deallocation)")
+    pm.add("func.func(finalizing-bufferize)")
+    pm.add("expand-strided-metadata")
+    pm.add("convert-vector-to-llvm")  # from here on: conversions to the LLVM dialect
+    pm.add("memref-expand")
+    pm.add("arith-expand")
+    pm.add("convert-arith-to-llvm")
+    pm.add("finalize-memref-to-llvm")
+    pm.add("convert-scf-to-cf")
+    pm.add("func.func(llvm-request-c-wrappers)")
+    pm.add("convert-openmp-to-llvm")
+    pm.add("convert-math-to-llvm")
+    pm.add("convert-math-to-libm")
+    pm.add("convert-func-to-llvm")
+    pm.add("reconcile-unrealized-casts")
+    pm.run(module.operation)
+    return module
+
+
+def new_ranked_memref_descriptor(nparray: np.ndarray):  # build a ranked memref descriptor (ctypes struct) for nparray
+    ctp = rt.as_ctype(nparray.dtype)
+    if nparray.ndim == 0:
+        x = rt.make_zero_d_memref_descriptor(ctp)()
+        x.allocated = nparray.ctypes.data  # 0-d case points straight at the NumPy buffer (no copy)
+        x.aligned = nparray.ctypes.data_as(ctypes.POINTER(ctp))
+        x.offset = ctypes.c_longlong(0)
+        return x
+
+    x = rt.make_nd_memref_descriptor(nparray.ndim, ctp)()
+    nbytes = nparray.nbytes
+    buffer = ctypes.create_string_buffer(nbytes)  # n-d case copies the data into a fresh ctypes buffer
+    ctypes.memmove(buffer, nparray.ctypes.data, nbytes)
+    x.allocated = ctypes.cast(buffer, ctypes.c_void_p).value
+    x.aligned = ctypes.cast(buffer, ctypes.POINTER(ctp))  # NOTE(review): descriptor relies on `buffer` staying alive - confirm
+    x.offset = ctypes.c_longlong(0)
+    x.shape = nparray.ctypes.shape
+
+    # Numpy uses byte quantities to express strides, MLIR OTOH uses the
+    # torch abstraction which specifies strides in terms of elements.
+    strides_ctype_t = ctypes.c_longlong * nparray.ndim
+    x.strides = strides_ctype_t(
+        *[x // nparray.itemsize for x in nparray.strides]  # comprehension-local x, not the descriptor x
+    )
+    return x
+
+
+def testMemrefAdd():  # parse a small memref add function, lower it, invoke it on 32.5 and 6, print the result
+    with Context():
+        module = Module.parse(
+            """
+    module  {
+      func.func @main(%arg0: memref<1xf32>, %arg1: memref<f32>, %arg2: memref<1xf32>) attributes { llvm.emit_c_interface } {
+        %0 = arith.constant 0 : index
+        %1 = memref.load %arg0[%0] : memref<1xf32>
+        %2 = memref.load %arg1[] : memref<f32>
+        %3 = arith.addf %1, %2 : f32
+        memref.store %3, %arg2[%0] : memref<1xf32>
+        return
+      }
+    } """
+        )
+        arg1 = np.array([32.5]).astype(np.float32)
+        arg2 = np.array(6).astype(np.float32)  # 0-d array, matches the memref<f32> argument
+        res = np.array([0]).astype(np.float32)
+
+        arg1_memref_ptr = ctypes.pointer(  # each argument is passed as a pointer to a pointer to its descriptor
+            ctypes.pointer(rt.get_ranked_memref_descriptor(arg1))
+        )
+        arg2_memref_ptr = ctypes.pointer(
+            ctypes.pointer(rt.get_ranked_memref_descriptor(arg2))
+        )
+        res_memref_ptr = ctypes.pointer(
+            ctypes.pointer(rt.get_ranked_memref_descriptor(res))
+        )
+
+        execution_engine = ExecutionEngine(lower_to_llvm_cpu(module))
+        execution_engine.invoke(
+            "main", arg1_memref_ptr, arg2_memref_ptr, res_memref_ptr
+        )
+        npout = rt.ranked_memref_to_numpy(res_memref_ptr[0])
+        print(npout)
+
+def get_memref_descriptors(args: list[Type]):  # one random-filled memref descriptor pointer per type in args
+    memref_ptrs = []
+    for arg in args:
+        elem_type = to_numpy(str(arg.element_type))
+        np_arg = np.random.rand(*arg.shape).astype(elem_type)  # random data with the type's shape and element type
+        memref_ptrs.append(
+            ctypes.pointer(  # pointer-to-pointer, the same form testMemrefAdd passes to invoke
+                ctypes.pointer(new_ranked_memref_descriptor(np_arg))
+            )
+        )
+    return memref_ptrs
+
+def test():
+    """Lower the matmul.mlir next to this script, run it on CPU and compare the result with NumPy."""
+    import os  # function-scope import: `os` is not among this file's imports
+    mlir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "matmul.mlir")
+    with Context() as ctx, open(mlir_path, "r") as file:
+        module: Module = Module.parse(file.read())
+        funcOp: func.FuncOp = (
+            module.operation.regions[0].blocks[0].operations[0]
+        )
+        funcName = str(funcOp.name).replace('"', "")
+        assert isinstance(funcOp, func.FuncOp)
+        args_type: list[Type] = [arg.type for arg in funcOp.arguments]
+        res_type = funcOp.type.results
+
+        newModule = lower_to_llvm_cpu(module)
+        memref_ptrs = get_memref_descriptors(res_type+args_type)
+
+        engine = ExecutionEngine(newModule,shared_libs=['/usr/lib/libomp.so'])
+        engine.invoke(funcName, *memref_ptrs)
+        out = rt.ranked_memref_to_numpy(memref_ptrs[0][0])
+        print(out)
+        input1 = rt.ranked_memref_to_numpy(memref_ptrs[1][0])
+        input2 = rt.ranked_memref_to_numpy(memref_ptrs[2][0])
+        numpy_out = np.matmul(input1, input2)
+        print(f"MLIR equal to NumPy? {np.allclose(out, numpy_out)}")
+
+test()
diff --git a/examples/BuddyGraph/README.md b/examples/BuddyGraph/README.md
new file mode 100644
index 000000000..d7b977f57
--- /dev/null
+++ b/examples/BuddyGraph/README.md
@@ -0,0 +1,23 @@
+# Buddy Graph Representation Examples
+
+## Run the Examples
+
+0. Enter your Python Env
+```
+(base)$ conda activate buddy
+(buddy)$ ...
+```
+1. Build Python Packages
+2. Configure Python Path
+```
+(buddy)$ cd buddy-mlir/build
+(buddy)$ export BUDDY_MLIR_BUILD_DIR=$PWD
+(buddy)$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
+(buddy)$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
+
+```
+3. Run the Examples
+```
+(buddy)$ cd examples/BuddyGraph
+(buddy)$ python import-dynamo-break.py
+```
\ No newline at end of file
diff --git a/examples/BuddyGraph/import-dynamo-break.py b/examples/BuddyGraph/import-dynamo-break.py
new file mode 100644
index 000000000..42bbed603
--- /dev/null
+++ b/examples/BuddyGraph/import-dynamo-break.py
@@ -0,0 +1,63 @@
+# ===- import-dynamo-break.py --------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# The example for dynamo graph break, import, and execute.
+#
+# ===---------------------------------------------------------------------------
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+from torch._functorch.aot_autograd import aot_autograd_decompositions
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class TestModule(torch.nn.Module):  # two-input module whose forward branches on a tensor value
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def forward(self, b, c):
+        if torch.nn.functional.silu(b)[0][0]:  # data-dependent branch; presumably what triggers the dynamo graph break
+            return torch.add(b, c)
+        else:
+            return torch.matmul(b, c)
+
+# Define a PyTorch model and run it with PyTorch runtime.
+model = TestModule()
+a, b = torch.randn((1024, 1024)), torch.randn((1024, 1024))
+print(model(a, b))
+
+# JIT Mode
+# Initialize Buddy Dynamo Compiler to compile and execute the PyTorch model.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=aot_autograd_decompositions
+)
+model_opt = torch.compile(model, backend=dynamo_compiler)
+print(model_opt(a, b))
+
+torch._dynamo.reset()
+
+# AOT Mode
+# Import PyTorch model to Buddy Graph and MLIR/LLVM IR.
+graphs = dynamo_compiler.importer(
+    model, a, b
+)
+for g in graphs:
+    g.lower_to_top_level_ir()
+    print(g._imported_module)
diff --git a/examples/BuddyLlama/.gitignore b/examples/BuddyLlama/.gitignore
index 6fc96fbb7..ffee494f3 100644
--- a/examples/BuddyLlama/.gitignore
+++ b/examples/BuddyLlama/.gitignore
@@ -1,5 +1,5 @@
 # model params file
-arg0.data
+*.data
 
 # model mlir file
-llama.mlir
+*.mlir
diff --git a/examples/BuddyLlama/CMakeLists.txt b/examples/BuddyLlama/CMakeLists.txt
index c344cfe44..6c70f11c7 100644
--- a/examples/BuddyLlama/CMakeLists.txt
+++ b/examples/BuddyLlama/CMakeLists.txt
@@ -50,6 +50,47 @@ add_custom_command(
 
 add_library(LLAMA STATIC llama.o)
 
+add_custom_command(
+  OUTPUT llama-gpu.o
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir 
+            -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
+          ${BUDDY_BINARY_DIR}/buddy-opt
+            -arith-expand
+            -eliminate-empty-tensors
+            -empty-tensor-to-alloc-tensor
+            -linalg-bufferize
+            -matmul-paralell-vectorization-optimize
+            -batchmatmul-optimize
+            -convert-linalg-to-affine-loops
+            -affine-loop-fusion
+            -affine-parallelize
+            -lower-affine
+            -canonicalize
+            -func-bufferize
+            -arith-bufferize
+            -tensor-bufferize
+            -buffer-deallocation
+            -finalizing-bufferize
+            -gpu-map-parallel-loops
+            -convert-parallel-loops-to-gpu
+            -canonicalize
+            -gpu-kernel-outlining
+            -convert-scf-to-cf
+            -memref-expand
+            -finalize-memref-to-llvm
+            -convert-arith-to-llvm
+            -convert-gpu-to-nvvm='has-redux=1'
+            -llvm-request-c-wrappers
+            --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" |
+        ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+        ${LLVM_MLIR_BINARY_DIR}/llvm-as |
+        ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+          -o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/llama-gpu.o
+  DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir
+  COMMENT "Building llama-gpu.o "
+  VERBATIM)
+add_library(LLAMA_GPU STATIC llama-gpu.o)
+
 SET_SOURCE_FILES_PROPERTIES(
   template.o
   PROPERTIES
@@ -74,3 +115,21 @@ if(BUDDY_MLIR_USE_MIMALLOC)
 endif()
 
 target_link_libraries(buddy-llama-run ${BUDDY_LLAMA_LIBS})
+
+SET_TARGET_PROPERTIES(
+  LLAMA_GPU
+  PROPERTIES
+  LINKER_LANGUAGE C)
+
+set(BUDDY_LLAMA_GPU_LIBS
+  LLAMA_GPU
+  mlir_c_runner_utils
+  omp
+)
+if(BUDDY_MLIR_USE_MIMALLOC)
+  list(APPEND BUDDY_LLAMA_GPU_LIBS mimalloc)
+endif()
+
+add_executable(buddy-llama-gpu-run llama-gpu.cpp)
+target_link_directories(buddy-llama-gpu-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(buddy-llama-gpu-run ${BUDDY_LLAMA_GPU_LIBS})
diff --git a/examples/BuddyLlama/README-gpu.md b/examples/BuddyLlama/README-gpu.md
new file mode 100644
index 000000000..a6cdffd79
--- /dev/null
+++ b/examples/BuddyLlama/README-gpu.md
@@ -0,0 +1,197 @@
+# Buddy Compiler LLaMA on GPU Example
+
+**This is a work in progress. The current version of Buddy-MLIR uses an older version of LLVM, which is not compatible with the latest version of CUDA. We are working on updating the LLVM version.**
+
+## 1. Prerequisites
+Please refer to [readme-cpu.md](readme-cpu.md) for most of the steps, except for the following steps.
+
+1. Install CUDA-toolkit
+Please refer to [CUDA-toolkit](https://developer.nvidia.com/cuda-toolkit) for installation.
+It is suggested that you install Nsight Systems and Nsight Compute as well for profiling. Please refer to [nsight-system](https://developer.nvidia.com/nsight-systems) and [nsight-compute](https://developer.nvidia.com/nsight-compute) for installation.
+Don't forget to add CUDA and other tools to your PATH.
+
+...
+
+For Step 4. Build and check LLVM/MLIR, please enable CUDA runner for MLIR.
+
+```
+$ cd buddy-mlir
+$ mkdir llvm/build
+$ cd llvm/build
+$ cmake -G Ninja ../llvm \
+    -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
+    -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+    -DPython3_EXECUTABLE=$(which python3) \
+    -DMLIR_ENABLE_CUDA_RUNNER=ON \
+    -DLLVM_CCACHE_BUILD=ON
+$ ninja check-clang check-mlir omp
+```
+
+## 2. Lowering LLaMA MLIR to CUDA
+We would use multiple steps to demonstrate the lowering process. Notice the first process would require the `mlir-opt` built in previous steps, but the remaining ones would need the latest version of `mlir-opt` and other llvm tools such as `llc`.
+
+### 2.1 Lowering TOSA to Linalg
+Due to the availability of certain operations such as `transpose`, the current LLaMA lowering process requires the use of the TOSA dialect. We first lower the LLaMA model to a mixture of TOSA and Linalg dialects.
+```
+mlir-opt llama.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o llama-linalg-default.mlir
+```
+**Use the old version of `mlir-opt` built in the previous steps; otherwise you might get the following error:**
+```
+llama.mlir:747:11: error: 'tosa.mul' op attribute 'shift' failed to satisfy constraint: 8-bit signless integer attribute
+    %36 = "tosa.mul"(%5, %35) {shift = 0 : i32} : (tensor<1x80x4096xf32>, tensor<1x80x1xf32>) -> tensor<1x80x4096xf32>
+          ^
+```
+There should be no `tosa` operations in the output. Most of the operations should be `linalg` operations such as `matmul`, `batch_matmul` or `generic`.
+
+### 2.2 Bufferizing Linalg
+This step bufferizes the Linalg operations. It would fully convert the linalg-on-tensor operations to scf-on-memref operations.
+
+- Bufferize using the old bufferization pipeline:
+```
+mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o llama-bufferized.mlir
+```
+
+- Bufferize everything using one-shot-bufferize:
+```
+mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -expand-realloc  -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline  -o llama-bufferized.mlir
+```
+
+- Bufferize everything but function boundaries using one-shot-bufferize:
+```
+mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize -func-bufferize -expand-realloc  -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -finalizing-bufferize -cse -canonicalize -buffer-deallocation-pipeline  -o llama-bufferized.mlir
+```
+
+- Bufferize GPU first
+```
+buddy-opt -gpu-bufferize llama-linalg-default.mlir -o llama-gpu-bufferized.mlir  
+```
+
+- Bufferize everything else using one-shot-bufferize:
+```
+mlir-opt llama-gpu-bufferized.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -expand-realloc  -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline  -o llama-bufferized.mlir
+```
+
+You should no longer see any linalg-on-tensor operations. All operations should look like this:
+
+```
+scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c80, %c80) step (%c1, %c1) {
+      %6 = memref.load %0[%arg3] : memref<80xi64>
+      %7 = memref.load %expand_shape_650[%arg2, %c0] : memref<80x1xi64>
+      %8 = arith.cmpi slt, %6, %7 : i64
+      memref.store %8, %alloc_651[%arg2, %arg3] : memref<80x80xi1>
+      scf.yield
+    }
+```
+
+### 2.3 Converting to GPU
+This step converts the scf-on-memref operations to gpu operations, with gpu kernels outlined.
+
+```
+mlir-opt llama-bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o llama-outlined.mlir
+```
+
+GPU kernels will be converted into separate modules and functions as such:
+```
+gpu.module @forward_kernel_753 {
+    gpu.func @forward_kernel(%arg0: memref<80x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<80x4096xf32>, %arg3: index, %arg4: index, %arg5: index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 80, 4096, 1>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = gpu.block_id  z
+      %3 = gpu.thread_id  x
+      %4 = gpu.thread_id  y
+      %5 = gpu.thread_id  z
+      %6 = gpu.grid_dim  x
+      %7 = gpu.grid_dim  y
+      %8 = gpu.grid_dim  z
+      %9 = gpu.block_dim  x
+      %10 = gpu.block_dim  y
+      %11 = gpu.block_dim  z
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      scf.for %arg6 = %arg3 to %arg4 step %arg5 {
+        %12 = memref.load %arg0[%0, %arg6] : memref<80x4096xf32>
+        %13 = memref.load %arg1[%arg6, %1] : memref<4096x4096xf32>
+        %14 = memref.load %arg2[%0, %1] : memref<80x4096xf32>
+        %15 = arith.mulf %12, %13 : f32
+        %16 = arith.addf %14, %15 : f32
+        memref.store %16, %arg2[%0, %1] : memref<80x4096xf32>
+      }
+      gpu.return
+    }
+  }
+```
+
+### 2.4 Converting to LLVM and NVVM operations
+This step converts the operations to LLVM dialect operations, and then convert some math functions to NVVM intrinsics.
+
+```
+buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir
+mlir-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
+```
+
+Why do we need the `convert-gpu-to-nvvm` step? If it is not applied, and we are using the unmodified lowering pipeline from torch to linalg, the generated LLVM IR would look like this:
+```
+%24 = llvm.getelementptr %17[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+%25 = llvm.load %24 : !llvm.ptr -> f32
+%26 = math.fpowi %25, %arg2 : f32, i32
+%27 = llvm.extractvalue %2[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+%28 = llvm.mlir.constant(327680 : index) : i64
+```
+For CPU, math operations such as `math.fpowi` would be lowered to LLVM intrinsics such as `llvm.powi.f32`. However, for GPU, we need to use NVVM intrinsics, and sadly there is no NVVM intrinsic for `math.fpowi`. So we need to change the lowering pipeline to use `math.powf` instead. Before lowering to NVVM, it would look like this:
+```
+%24 = llvm.getelementptr %17[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+%25 = llvm.load %24 : !llvm.ptr -> f32
+%26 = math.powf %25, %arg2 : f32
+%27 = llvm.extractvalue %2[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+%28 = llvm.mlir.constant(327680 : index) : i64
+```
+
+And after the lowering:
+```
+llvm.func @__nv_powf(f32, f32) -> f32
+...
+%41 = llvm.getelementptr %36[%40] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+%42 = llvm.load %41 : !llvm.ptr -> f32
+%43 = llvm.call @__nv_powf(%42, %arg10) : (f32, f32) -> f32
+%44 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> 
+```
+
+It now uses `llvm.call` to call the NVVM intrinsics.
+
+### 2.5 Request C wrappers
+Notice that you must request wrappers before compiling GPU codes.
+```
+mlir-opt llama-nvvm.mlir -llvm-request-c-wrappers -o llama-wrapper.mlir
+```
+
+### 2.6 Lowering to LLVM Dialect + GPU Binary
+```
+mlir-opt llama-wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o llama-cubin.mlir
+```
+Now you could use a builtin pipeline to lower code to nvvm. Notice that you must specify the chip and features. You could find the chip and features from Nvidia.
+After the process all gpu code would be compiled.
+```
+  gpu.binary @forward_kernel_1078  [#gpu.object<#nvvm.target<chip = "sm_80", features = "+ptx71">, "...">]
+```
+
+### 2.7 Translate to LLVM IR
+```
+mlir-translate llama-cubin.mlir --mlir-to-llvmir -o llama.ll
+```
+
+### 2.8 Compile the LLVM IR
+**Remember to use the latest version of `llc`, as the latest version of MLIR generates some new intrinsics that are not supported by the old version of `llc`.**
+```
+llc llama.ll -filetype=obj -relocation-model=pic -O3 -o llama.o
+```
+
+### 2.9 Link the object file and run
+Following is an example of linking the object file with the runtime library and run the program. You could find the runtime library in the build directory of llvm-project.
+```
+clang llama.o llama-main.cpp.o /path-to/llvm-project/build/lib/libmlir_cuda_runtime.so /path-to/llvm-project/build/lib/libmlir_c_runner_utils.so
+```
+**Notice that the current version of llvm-project used by Buddy-MLIR encounters problems with `MLIR_ENABLE_CUDA_RUNNER` enabled. Please use the latest version of MLIR for this step.**
\ No newline at end of file
diff --git a/examples/BuddyLlama/import-llama2.py b/examples/BuddyLlama/import-llama2.py
index d5a3a29e1..d63eebe37 100644
--- a/examples/BuddyLlama/import-llama2.py
+++ b/examples/BuddyLlama/import-llama2.py
@@ -19,11 +19,13 @@
 # ===---------------------------------------------------------------------------
 
 import os
+import time
 
 import numpy
 import torch
 from transformers import LlamaForCausalLM, LlamaTokenizer
 from torch._functorch.aot_autograd import aot_autograd_decompositions
+from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa
@@ -31,6 +33,7 @@
 
 # Retrieve the LLaMA model path from environment variables.
 model_path = os.environ.get("LLAMA_MODEL_PATH")
+model_path = "/home/liam/PLCT/Llama-2-7b-chat-hf"
 if model_path is None:
     raise EnvironmentError(
         "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid."
@@ -44,22 +47,29 @@
 # Initialize Dynamo Compiler with specific configurations as an importer.
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=aot_autograd_decompositions,
+    aot_autograd_decomposition=inductor_decomp,
 )
 
 # Import the model into MLIR module and parameters.
 with torch.no_grad():
-    gm, params = dynamo_compiler.importer(
-        model, torch.tensor([[1 for _ in range(40)]], dtype=torch.int64)
-    )
+    data = torch.tensor([[1 for i in range(40)]], dtype=torch.int64)
+    graphs = dynamo_compiler.importer(model, data)
 
+assert len(graphs) == 1
+graph = graphs[0]
+params = dynamo_compiler.imported_params[graph]
+graph.lower_to_top_level_ir(False)
 path_prefix = os.path.dirname(os.path.abspath(__file__))
 # Write the MLIR module to the file.
 with open(os.path.join(path_prefix, "llama.mlir"), "w") as module_file:
-    print(gm, file=module_file)
+    print(graph._imported_module, file=module_file)
 
-# Concatenate all parameters into a single numpy array and write to a file.
-all_param = numpy.concatenate(
-    [param.detach().numpy().reshape([-1]) for param in params]
-)
-all_param.tofile(os.path.join(path_prefix, "arg0.data"))
+param_file = os.path.dirname(os.path.abspath(__file__)) + "/arg0.data"
+if not os.path.exists(param_file):
+    # Concatenate all parameters into a single numpy array and write to a file.
+    all_param = numpy.concatenate(
+        [param.detach().numpy().reshape([-1]) for param in params]
+    )
+    # if file exists, skip dumping
+
+    all_param.tofile(param_file)
diff --git a/examples/BuddyLlama/llama-gpu.cpp b/examples/BuddyLlama/llama-gpu.cpp
new file mode 100644
index 000000000..e07f4a3cf
--- /dev/null
+++ b/examples/BuddyLlama/llama-gpu.cpp
@@ -0,0 +1,189 @@
+//===- llama-gpu.cpp ------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include <buddy/Core/Container.h>
+#include <buddy/LLM/TextContainer.h>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
+using namespace buddy;
+
+constexpr size_t ParamsSize = 6755192832;
+constexpr size_t MaxVocabSize = 32000;
+constexpr size_t MaxTokenLength = 40;
+constexpr size_t HiddenSize = 4096;
+
+/// Declare LLaMA forward function.
+extern "C" void _mlir_ciface_forward(MemRef<float, 3> *, MemRef<float, 1> *,
+                                     Text<size_t, 2> *);
+
+// -----------------------------------------------------------------------------
+// Helper Functions
+// -----------------------------------------------------------------------------
+
+/// Prompt on stdout, then read one line from stdin into `inputStr`.
+void getUserInput(std::string &inputStr) {
+  std::cout << "\nPlease send a message:" << std::endl;
+  std::cout << ">>> ";
+  getline(std::cin, inputStr);
+  std::cout << std::endl;
+}
+
+/// Print the "[Log] " label to stdout in bold blue, then reset the style.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+/// Print the iteration index, the generated token, and `time` (in seconds).
+void printIterInfo(size_t iterIdx, std::string str, double time) {
+  std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m";
+  std::cout << "Token: " << str << " | "
+            << "Time: " << time << "s" << std::endl;
+}
+
+/// Tokenize `inputContainer` with `vocabFile`; log the path and elapsed time.
+void tokenizeInput(const std::string &vocabFile,
+                   Text<size_t, 2> &inputContainer) {
+  printLogLabel();
+  std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile)
+            << std::endl;
+  const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now();
+  inputContainer.tokenizeLlama(vocabFile, MaxTokenLength);
+  const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double, std::milli> buddyTokenizeTime =
+      buddyTokenizeEnd - buddyTokenizeStart;
+  printLogLabel();
+  std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms"
+            << std::endl;
+}
+
+/// Read the binary params file into `params`; throw if open or read fails.
+void loadParameters(const std::string &paramFilePath,
+                    MemRef<float, 1> &params) {
+  const auto loadStart = std::chrono::high_resolution_clock::now();
+  std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
+  if (!paramFile.is_open()) {
+    throw std::runtime_error("[Error] Failed to open params file!");
+  }
+  printLogLabel();
+  std::cout << "Loading params..." << std::endl;
+  printLogLabel();
+  std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
+            << std::endl;
+  paramFile.read(reinterpret_cast<char *>(params.getData()),
+                 sizeof(float) * (params.getSize()));
+  if (paramFile.fail()) {
+    throw std::runtime_error("Error occurred while reading params file!");
+  }
+  paramFile.close();
+  const auto loadEnd = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double, std::milli> loadTime =
+      loadEnd - loadStart;
+  printLogLabel();
+  std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
+            << "s\n"
+            << std::endl;
+}
+
+/// Return the offset from `start` of the max element in [start, end).
+int findMaxIndex(const float *start, const float *end) {
+  return std::distance(start, std::max_element(start, end));
+}
+
+// -----------------------------------------------------------------------------
+// LLaMA Inference Main Entry
+// -----------------------------------------------------------------------------
+
+int main() {
+  /// Print the title of this example.
+  const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler";
+  std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+  /// Define paths of the vocabulary and parameter files.
+  const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt";
+  const std::string paramsDir = "../../examples/BuddyLlama/arg0.data";
+
+  /// Get user message.
+  std::string inputStr;
+  getUserInput(inputStr);
+
+  /// Initialize data containers
+  //  - Input container.
+  //  - Result container
+  //  - Output container.
+  //  - Parameters container.
+  Text<size_t, 2> outputContainer;
+  MemRef<float, 3> resultContainer[2] = {
+      MemRef<float, 3>({1, MaxTokenLength, MaxVocabSize}, false, 0),
+      MemRef<float, 3>({1, MaxTokenLength, HiddenSize}, false, 0)};
+  Text<size_t, 2> inputContainer(inputStr);
+  MemRef<float, 1> paramsContainer({ParamsSize});
+
+  /// Fill data into containers
+  //  - Input: register vocabulary and tokenize the input string.
+  //  - Output: register vocabulary.
+  //  - Parameters: load parameters from the `arg0` file into the container.
+  tokenizeInput(vocabDir, inputContainer);
+  outputContainer.loadVocab(vocabDir);
+  loadParameters(paramsDir, paramsContainer);
+
+  /// Run LLaMA Inference
+  //  - Perform the forward function.
+  //  - Find and append the generated token.
+  //  - Continue iterating until the terminal condition is met.
+  int generateLen = MaxTokenLength - inputContainer.getTokenCnt();
+  for (int i = 0; i < generateLen; i++) {
+    const auto inferenceStart = std::chrono::high_resolution_clock::now();
+    // Execute the forward pass of the model.
+    _mlir_ciface_forward(resultContainer, &paramsContainer, &inputContainer);
+
+    const auto inferenceEnd = std::chrono::high_resolution_clock::now();
+    const std::chrono::duration<double, std::milli> inferenceTime =
+        inferenceEnd - inferenceStart;
+
+    // Determine the generated token.
+    int tokenIndex = inputContainer.getTokenCnt() - 1;
+    const float *startPtr =
+        resultContainer[0].getData() + tokenIndex * MaxVocabSize;
+    const float *endPtr = startPtr + MaxVocabSize;
+    int maxIndex = findMaxIndex(startPtr, endPtr);
+    std::string tok = inputContainer.getStr(maxIndex);
+    // Print the generated token and inference time.
+    printIterInfo(i, tok, inferenceTime.count() / 1000);
+
+    // Stop if the separator token (2, </s>) is generated; other tokens,
+    // including the line break token (13, <0x0A>), do not end generation.
+    if (maxIndex == 2) {
+      break;
+    }
+    // Append the generated token into the input and output container.
+    inputContainer.appendTokenIdx(maxIndex);
+    outputContainer.appendTokenIdx(maxIndex);
+    free(resultContainer[0].release());
+    free(resultContainer[1].release());
+  }
+
+  /// Print the final result
+  std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl;
+  std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama()
+            << std::endl;
+
+  return 0;
+}
diff --git a/examples/BuddyLlama/llama-linalg-default.mlir b/examples/BuddyLlama/llama-linalg-default.mlir
new file mode 100644
index 000000000..ab99e1246
--- /dev/null
+++ b/examples/BuddyLlama/llama-linalg-default.mlir
@@ -0,0 +1,12905 @@
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map3 = affine_map<(d0) -> (d0)>
+#map4 = affine_map<(d0, d1, d2) -> (d1)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map8 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map9 = affine_map<(d0, d1) -> (d1, d0)>
+#map10 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
+#map11 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)>
+#map12 = affine_map<(d0, d1) -> (0, d0, d1)>
+#map13 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map14 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>
+#map15 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)>
+module {
+  func.func @forward(%arg0: tensor<6755192832xf32> {bufferization.writable = false}, %arg1: tensor<1x80xi64>) -> tensor<1x80x32000xf32> {
+    %extracted_slice = tensor.extract_slice %arg0[0] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg0[4096] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg0[8192] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_2 = tensor.extract_slice %arg0[12288] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_3 = tensor.extract_slice %arg0[16384] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_4 = tensor.extract_slice %arg0[20480] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_5 = tensor.extract_slice %arg0[24576] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_6 = tensor.extract_slice %arg0[28672] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_7 = tensor.extract_slice %arg0[32768] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_8 = tensor.extract_slice %arg0[36864] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_9 = tensor.extract_slice %arg0[40960] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_10 = tensor.extract_slice %arg0[45056] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_11 = tensor.extract_slice %arg0[49152] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_12 = tensor.extract_slice %arg0[53248] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_13 = tensor.extract_slice %arg0[57344] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_14 = tensor.extract_slice %arg0[61440] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_15 = tensor.extract_slice %arg0[65536] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_16 = tensor.extract_slice %arg0[69632] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_17 = tensor.extract_slice %arg0[73728] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_18 = tensor.extract_slice %arg0[77824] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_19 = tensor.extract_slice %arg0[81920] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_20 = tensor.extract_slice %arg0[86016] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_21 = tensor.extract_slice %arg0[90112] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_22 = tensor.extract_slice %arg0[94208] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_23 = tensor.extract_slice %arg0[98304] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_24 = tensor.extract_slice %arg0[102400] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_25 = tensor.extract_slice %arg0[106496] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_26 = tensor.extract_slice %arg0[110592] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_27 = tensor.extract_slice %arg0[114688] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_28 = tensor.extract_slice %arg0[118784] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_29 = tensor.extract_slice %arg0[122880] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_30 = tensor.extract_slice %arg0[126976] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_31 = tensor.extract_slice %arg0[131072] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_32 = tensor.extract_slice %arg0[135168] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_33 = tensor.extract_slice %arg0[139264] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_34 = tensor.extract_slice %arg0[143360] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_35 = tensor.extract_slice %arg0[147456] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_36 = tensor.extract_slice %arg0[151552] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_37 = tensor.extract_slice %arg0[155648] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_38 = tensor.extract_slice %arg0[159744] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_39 = tensor.extract_slice %arg0[163840] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_40 = tensor.extract_slice %arg0[167936] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_41 = tensor.extract_slice %arg0[172032] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_42 = tensor.extract_slice %arg0[176128] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_43 = tensor.extract_slice %arg0[180224] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_44 = tensor.extract_slice %arg0[184320] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_45 = tensor.extract_slice %arg0[188416] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_46 = tensor.extract_slice %arg0[192512] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_47 = tensor.extract_slice %arg0[196608] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_48 = tensor.extract_slice %arg0[200704] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_49 = tensor.extract_slice %arg0[204800] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_50 = tensor.extract_slice %arg0[208896] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_51 = tensor.extract_slice %arg0[212992] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_52 = tensor.extract_slice %arg0[217088] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_53 = tensor.extract_slice %arg0[221184] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_54 = tensor.extract_slice %arg0[225280] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_55 = tensor.extract_slice %arg0[229376] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_56 = tensor.extract_slice %arg0[233472] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_57 = tensor.extract_slice %arg0[237568] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_58 = tensor.extract_slice %arg0[241664] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_59 = tensor.extract_slice %arg0[245760] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_60 = tensor.extract_slice %arg0[249856] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_61 = tensor.extract_slice %arg0[253952] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_62 = tensor.extract_slice %arg0[258048] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_63 = tensor.extract_slice %arg0[262144] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32>
+    %extracted_slice_64 = tensor.extract_slice %arg0[266240] [131072000] [1] : tensor<6755192832xf32> to tensor<131072000xf32>
+    %expanded = tensor.expand_shape %extracted_slice_64 [[0, 1]] : tensor<131072000xf32> into tensor<32000x4096xf32>
+    %extracted_slice_65 = tensor.extract_slice %arg0[131338240] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_66 = tensor.expand_shape %extracted_slice_65 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_67 = tensor.extract_slice %arg0[148115456] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_68 = tensor.expand_shape %extracted_slice_67 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_69 = tensor.extract_slice %arg0[164892672] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_70 = tensor.expand_shape %extracted_slice_69 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_71 = tensor.extract_slice %arg0[181669888] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_72 = tensor.expand_shape %extracted_slice_71 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_73 = tensor.extract_slice %arg0[198447104] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_74 = tensor.expand_shape %extracted_slice_73 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_75 = tensor.extract_slice %arg0[243535872] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_76 = tensor.expand_shape %extracted_slice_75 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_77 = tensor.extract_slice %arg0[288624640] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_78 = tensor.expand_shape %extracted_slice_77 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_79 = tensor.extract_slice %arg0[333713408] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_80 = tensor.expand_shape %extracted_slice_79 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_81 = tensor.extract_slice %arg0[350490624] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_82 = tensor.expand_shape %extracted_slice_81 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_83 = tensor.extract_slice %arg0[367267840] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_84 = tensor.expand_shape %extracted_slice_83 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_85 = tensor.extract_slice %arg0[384045056] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_86 = tensor.expand_shape %extracted_slice_85 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_87 = tensor.extract_slice %arg0[400822272] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_88 = tensor.expand_shape %extracted_slice_87 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_89 = tensor.extract_slice %arg0[445911040] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_90 = tensor.expand_shape %extracted_slice_89 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_91 = tensor.extract_slice %arg0[490999808] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_92 = tensor.expand_shape %extracted_slice_91 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_93 = tensor.extract_slice %arg0[536088576] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_94 = tensor.expand_shape %extracted_slice_93 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_95 = tensor.extract_slice %arg0[552865792] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_96 = tensor.expand_shape %extracted_slice_95 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_97 = tensor.extract_slice %arg0[569643008] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_98 = tensor.expand_shape %extracted_slice_97 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_99 = tensor.extract_slice %arg0[586420224] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_100 = tensor.expand_shape %extracted_slice_99 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_101 = tensor.extract_slice %arg0[603197440] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_102 = tensor.expand_shape %extracted_slice_101 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_103 = tensor.extract_slice %arg0[648286208] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_104 = tensor.expand_shape %extracted_slice_103 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_105 = tensor.extract_slice %arg0[693374976] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_106 = tensor.expand_shape %extracted_slice_105 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_107 = tensor.extract_slice %arg0[738463744] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_108 = tensor.expand_shape %extracted_slice_107 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_109 = tensor.extract_slice %arg0[755240960] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_110 = tensor.expand_shape %extracted_slice_109 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_111 = tensor.extract_slice %arg0[772018176] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_112 = tensor.expand_shape %extracted_slice_111 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_113 = tensor.extract_slice %arg0[788795392] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_114 = tensor.expand_shape %extracted_slice_113 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_115 = tensor.extract_slice %arg0[805572608] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_116 = tensor.expand_shape %extracted_slice_115 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_117 = tensor.extract_slice %arg0[850661376] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_118 = tensor.expand_shape %extracted_slice_117 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_119 = tensor.extract_slice %arg0[895750144] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_120 = tensor.expand_shape %extracted_slice_119 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_121 = tensor.extract_slice %arg0[940838912] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_122 = tensor.expand_shape %extracted_slice_121 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_123 = tensor.extract_slice %arg0[957616128] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_124 = tensor.expand_shape %extracted_slice_123 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_125 = tensor.extract_slice %arg0[974393344] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_126 = tensor.expand_shape %extracted_slice_125 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_127 = tensor.extract_slice %arg0[991170560] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_128 = tensor.expand_shape %extracted_slice_127 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_129 = tensor.extract_slice %arg0[1007947776] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_130 = tensor.expand_shape %extracted_slice_129 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_131 = tensor.extract_slice %arg0[1053036544] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_132 = tensor.expand_shape %extracted_slice_131 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_133 = tensor.extract_slice %arg0[1098125312] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_134 = tensor.expand_shape %extracted_slice_133 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_135 = tensor.extract_slice %arg0[1143214080] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_136 = tensor.expand_shape %extracted_slice_135 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_137 = tensor.extract_slice %arg0[1159991296] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_138 = tensor.expand_shape %extracted_slice_137 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_139 = tensor.extract_slice %arg0[1176768512] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_140 = tensor.expand_shape %extracted_slice_139 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_141 = tensor.extract_slice %arg0[1193545728] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_142 = tensor.expand_shape %extracted_slice_141 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_143 = tensor.extract_slice %arg0[1210322944] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_144 = tensor.expand_shape %extracted_slice_143 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_145 = tensor.extract_slice %arg0[1255411712] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_146 = tensor.expand_shape %extracted_slice_145 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_147 = tensor.extract_slice %arg0[1300500480] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_148 = tensor.expand_shape %extracted_slice_147 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_149 = tensor.extract_slice %arg0[1345589248] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_150 = tensor.expand_shape %extracted_slice_149 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_151 = tensor.extract_slice %arg0[1362366464] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_152 = tensor.expand_shape %extracted_slice_151 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_153 = tensor.extract_slice %arg0[1379143680] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_154 = tensor.expand_shape %extracted_slice_153 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_155 = tensor.extract_slice %arg0[1395920896] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_156 = tensor.expand_shape %extracted_slice_155 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_157 = tensor.extract_slice %arg0[1412698112] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_158 = tensor.expand_shape %extracted_slice_157 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_159 = tensor.extract_slice %arg0[1457786880] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_160 = tensor.expand_shape %extracted_slice_159 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_161 = tensor.extract_slice %arg0[1502875648] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_162 = tensor.expand_shape %extracted_slice_161 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_163 = tensor.extract_slice %arg0[1547964416] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_164 = tensor.expand_shape %extracted_slice_163 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_165 = tensor.extract_slice %arg0[1564741632] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_166 = tensor.expand_shape %extracted_slice_165 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_167 = tensor.extract_slice %arg0[1581518848] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_168 = tensor.expand_shape %extracted_slice_167 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_169 = tensor.extract_slice %arg0[1598296064] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_170 = tensor.expand_shape %extracted_slice_169 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_171 = tensor.extract_slice %arg0[1615073280] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_172 = tensor.expand_shape %extracted_slice_171 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_173 = tensor.extract_slice %arg0[1660162048] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_174 = tensor.expand_shape %extracted_slice_173 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_175 = tensor.extract_slice %arg0[1705250816] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_176 = tensor.expand_shape %extracted_slice_175 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_177 = tensor.extract_slice %arg0[1750339584] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_178 = tensor.expand_shape %extracted_slice_177 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_179 = tensor.extract_slice %arg0[1767116800] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_180 = tensor.expand_shape %extracted_slice_179 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_181 = tensor.extract_slice %arg0[1783894016] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_182 = tensor.expand_shape %extracted_slice_181 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_183 = tensor.extract_slice %arg0[1800671232] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_184 = tensor.expand_shape %extracted_slice_183 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_185 = tensor.extract_slice %arg0[1817448448] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_186 = tensor.expand_shape %extracted_slice_185 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_187 = tensor.extract_slice %arg0[1862537216] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_188 = tensor.expand_shape %extracted_slice_187 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_189 = tensor.extract_slice %arg0[1907625984] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_190 = tensor.expand_shape %extracted_slice_189 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_191 = tensor.extract_slice %arg0[1952714752] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_192 = tensor.expand_shape %extracted_slice_191 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_193 = tensor.extract_slice %arg0[1969491968] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_194 = tensor.expand_shape %extracted_slice_193 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_195 = tensor.extract_slice %arg0[1986269184] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_196 = tensor.expand_shape %extracted_slice_195 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_197 = tensor.extract_slice %arg0[2003046400] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_198 = tensor.expand_shape %extracted_slice_197 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_199 = tensor.extract_slice %arg0[2019823616] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_200 = tensor.expand_shape %extracted_slice_199 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_201 = tensor.extract_slice %arg0[2064912384] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_202 = tensor.expand_shape %extracted_slice_201 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_203 = tensor.extract_slice %arg0[2110001152] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_204 = tensor.expand_shape %extracted_slice_203 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_205 = tensor.extract_slice %arg0[2155089920] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_206 = tensor.expand_shape %extracted_slice_205 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_207 = tensor.extract_slice %arg0[2171867136] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_208 = tensor.expand_shape %extracted_slice_207 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_209 = tensor.extract_slice %arg0[2188644352] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_210 = tensor.expand_shape %extracted_slice_209 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_211 = tensor.extract_slice %arg0[2205421568] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_212 = tensor.expand_shape %extracted_slice_211 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_213 = tensor.extract_slice %arg0[2222198784] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_214 = tensor.expand_shape %extracted_slice_213 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_215 = tensor.extract_slice %arg0[2267287552] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_216 = tensor.expand_shape %extracted_slice_215 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_217 = tensor.extract_slice %arg0[2312376320] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_218 = tensor.expand_shape %extracted_slice_217 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_219 = tensor.extract_slice %arg0[2357465088] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_220 = tensor.expand_shape %extracted_slice_219 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_221 = tensor.extract_slice %arg0[2374242304] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_222 = tensor.expand_shape %extracted_slice_221 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_223 = tensor.extract_slice %arg0[2391019520] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_224 = tensor.expand_shape %extracted_slice_223 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_225 = tensor.extract_slice %arg0[2407796736] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_226 = tensor.expand_shape %extracted_slice_225 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_227 = tensor.extract_slice %arg0[2424573952] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_228 = tensor.expand_shape %extracted_slice_227 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_229 = tensor.extract_slice %arg0[2469662720] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_230 = tensor.expand_shape %extracted_slice_229 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_231 = tensor.extract_slice %arg0[2514751488] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_232 = tensor.expand_shape %extracted_slice_231 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_233 = tensor.extract_slice %arg0[2559840256] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_234 = tensor.expand_shape %extracted_slice_233 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_235 = tensor.extract_slice %arg0[2576617472] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_236 = tensor.expand_shape %extracted_slice_235 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_237 = tensor.extract_slice %arg0[2593394688] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_238 = tensor.expand_shape %extracted_slice_237 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_239 = tensor.extract_slice %arg0[2610171904] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_240 = tensor.expand_shape %extracted_slice_239 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_241 = tensor.extract_slice %arg0[2626949120] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_242 = tensor.expand_shape %extracted_slice_241 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_243 = tensor.extract_slice %arg0[2672037888] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_244 = tensor.expand_shape %extracted_slice_243 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_245 = tensor.extract_slice %arg0[2717126656] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_246 = tensor.expand_shape %extracted_slice_245 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_247 = tensor.extract_slice %arg0[2762215424] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_248 = tensor.expand_shape %extracted_slice_247 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_249 = tensor.extract_slice %arg0[2778992640] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_250 = tensor.expand_shape %extracted_slice_249 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_251 = tensor.extract_slice %arg0[2795769856] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_252 = tensor.expand_shape %extracted_slice_251 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_253 = tensor.extract_slice %arg0[2812547072] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_254 = tensor.expand_shape %extracted_slice_253 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_255 = tensor.extract_slice %arg0[2829324288] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_256 = tensor.expand_shape %extracted_slice_255 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_257 = tensor.extract_slice %arg0[2874413056] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_258 = tensor.expand_shape %extracted_slice_257 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_259 = tensor.extract_slice %arg0[2919501824] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_260 = tensor.expand_shape %extracted_slice_259 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_261 = tensor.extract_slice %arg0[2964590592] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_262 = tensor.expand_shape %extracted_slice_261 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_263 = tensor.extract_slice %arg0[2981367808] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_264 = tensor.expand_shape %extracted_slice_263 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_265 = tensor.extract_slice %arg0[2998145024] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_266 = tensor.expand_shape %extracted_slice_265 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_267 = tensor.extract_slice %arg0[3014922240] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_268 = tensor.expand_shape %extracted_slice_267 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_269 = tensor.extract_slice %arg0[3031699456] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_270 = tensor.expand_shape %extracted_slice_269 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_271 = tensor.extract_slice %arg0[3076788224] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_272 = tensor.expand_shape %extracted_slice_271 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_273 = tensor.extract_slice %arg0[3121876992] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_274 = tensor.expand_shape %extracted_slice_273 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_275 = tensor.extract_slice %arg0[3166965760] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_276 = tensor.expand_shape %extracted_slice_275 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_277 = tensor.extract_slice %arg0[3183742976] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_278 = tensor.expand_shape %extracted_slice_277 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_279 = tensor.extract_slice %arg0[3200520192] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_280 = tensor.expand_shape %extracted_slice_279 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_281 = tensor.extract_slice %arg0[3217297408] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_282 = tensor.expand_shape %extracted_slice_281 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_283 = tensor.extract_slice %arg0[3234074624] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_284 = tensor.expand_shape %extracted_slice_283 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_285 = tensor.extract_slice %arg0[3279163392] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_286 = tensor.expand_shape %extracted_slice_285 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_287 = tensor.extract_slice %arg0[3324252160] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_288 = tensor.expand_shape %extracted_slice_287 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_289 = tensor.extract_slice %arg0[3369340928] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_290 = tensor.expand_shape %extracted_slice_289 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_291 = tensor.extract_slice %arg0[3386118144] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_292 = tensor.expand_shape %extracted_slice_291 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_293 = tensor.extract_slice %arg0[3402895360] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_294 = tensor.expand_shape %extracted_slice_293 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_295 = tensor.extract_slice %arg0[3419672576] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_296 = tensor.expand_shape %extracted_slice_295 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_297 = tensor.extract_slice %arg0[3436449792] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_298 = tensor.expand_shape %extracted_slice_297 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_299 = tensor.extract_slice %arg0[3481538560] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_300 = tensor.expand_shape %extracted_slice_299 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_301 = tensor.extract_slice %arg0[3526627328] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_302 = tensor.expand_shape %extracted_slice_301 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_303 = tensor.extract_slice %arg0[3571716096] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_304 = tensor.expand_shape %extracted_slice_303 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_305 = tensor.extract_slice %arg0[3588493312] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_306 = tensor.expand_shape %extracted_slice_305 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_307 = tensor.extract_slice %arg0[3605270528] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_308 = tensor.expand_shape %extracted_slice_307 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_309 = tensor.extract_slice %arg0[3622047744] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_310 = tensor.expand_shape %extracted_slice_309 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_311 = tensor.extract_slice %arg0[3638824960] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_312 = tensor.expand_shape %extracted_slice_311 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_313 = tensor.extract_slice %arg0[3683913728] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_314 = tensor.expand_shape %extracted_slice_313 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_315 = tensor.extract_slice %arg0[3729002496] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_316 = tensor.expand_shape %extracted_slice_315 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_317 = tensor.extract_slice %arg0[3774091264] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_318 = tensor.expand_shape %extracted_slice_317 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_319 = tensor.extract_slice %arg0[3790868480] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_320 = tensor.expand_shape %extracted_slice_319 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_321 = tensor.extract_slice %arg0[3807645696] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_322 = tensor.expand_shape %extracted_slice_321 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_323 = tensor.extract_slice %arg0[3824422912] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_324 = tensor.expand_shape %extracted_slice_323 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_325 = tensor.extract_slice %arg0[3841200128] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_326 = tensor.expand_shape %extracted_slice_325 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_327 = tensor.extract_slice %arg0[3886288896] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_328 = tensor.expand_shape %extracted_slice_327 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_329 = tensor.extract_slice %arg0[3931377664] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_330 = tensor.expand_shape %extracted_slice_329 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_331 = tensor.extract_slice %arg0[3976466432] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_332 = tensor.expand_shape %extracted_slice_331 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_333 = tensor.extract_slice %arg0[3993243648] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_334 = tensor.expand_shape %extracted_slice_333 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_335 = tensor.extract_slice %arg0[4010020864] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_336 = tensor.expand_shape %extracted_slice_335 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_337 = tensor.extract_slice %arg0[4026798080] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_338 = tensor.expand_shape %extracted_slice_337 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_339 = tensor.extract_slice %arg0[4043575296] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_340 = tensor.expand_shape %extracted_slice_339 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_341 = tensor.extract_slice %arg0[4088664064] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_342 = tensor.expand_shape %extracted_slice_341 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_343 = tensor.extract_slice %arg0[4133752832] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_344 = tensor.expand_shape %extracted_slice_343 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_345 = tensor.extract_slice %arg0[4178841600] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_346 = tensor.expand_shape %extracted_slice_345 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_347 = tensor.extract_slice %arg0[4195618816] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_348 = tensor.expand_shape %extracted_slice_347 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_349 = tensor.extract_slice %arg0[4212396032] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_350 = tensor.expand_shape %extracted_slice_349 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_351 = tensor.extract_slice %arg0[4229173248] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_352 = tensor.expand_shape %extracted_slice_351 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_353 = tensor.extract_slice %arg0[4245950464] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_354 = tensor.expand_shape %extracted_slice_353 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_355 = tensor.extract_slice %arg0[4291039232] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_356 = tensor.expand_shape %extracted_slice_355 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_357 = tensor.extract_slice %arg0[4336128000] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_358 = tensor.expand_shape %extracted_slice_357 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_359 = tensor.extract_slice %arg0[4381216768] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_360 = tensor.expand_shape %extracted_slice_359 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_361 = tensor.extract_slice %arg0[4397993984] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_362 = tensor.expand_shape %extracted_slice_361 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_363 = tensor.extract_slice %arg0[4414771200] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_364 = tensor.expand_shape %extracted_slice_363 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_365 = tensor.extract_slice %arg0[4431548416] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_366 = tensor.expand_shape %extracted_slice_365 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_367 = tensor.extract_slice %arg0[4448325632] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_368 = tensor.expand_shape %extracted_slice_367 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_369 = tensor.extract_slice %arg0[4493414400] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_370 = tensor.expand_shape %extracted_slice_369 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_371 = tensor.extract_slice %arg0[4538503168] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_372 = tensor.expand_shape %extracted_slice_371 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_373 = tensor.extract_slice %arg0[4583591936] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_374 = tensor.expand_shape %extracted_slice_373 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_375 = tensor.extract_slice %arg0[4600369152] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_376 = tensor.expand_shape %extracted_slice_375 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_377 = tensor.extract_slice %arg0[4617146368] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_378 = tensor.expand_shape %extracted_slice_377 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_379 = tensor.extract_slice %arg0[4633923584] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_380 = tensor.expand_shape %extracted_slice_379 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_381 = tensor.extract_slice %arg0[4650700800] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_382 = tensor.expand_shape %extracted_slice_381 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_383 = tensor.extract_slice %arg0[4695789568] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_384 = tensor.expand_shape %extracted_slice_383 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_385 = tensor.extract_slice %arg0[4740878336] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_386 = tensor.expand_shape %extracted_slice_385 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_387 = tensor.extract_slice %arg0[4785967104] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_388 = tensor.expand_shape %extracted_slice_387 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_389 = tensor.extract_slice %arg0[4802744320] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_390 = tensor.expand_shape %extracted_slice_389 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_391 = tensor.extract_slice %arg0[4819521536] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_392 = tensor.expand_shape %extracted_slice_391 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_393 = tensor.extract_slice %arg0[4836298752] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_394 = tensor.expand_shape %extracted_slice_393 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_395 = tensor.extract_slice %arg0[4853075968] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_396 = tensor.expand_shape %extracted_slice_395 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_397 = tensor.extract_slice %arg0[4898164736] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_398 = tensor.expand_shape %extracted_slice_397 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_399 = tensor.extract_slice %arg0[4943253504] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_400 = tensor.expand_shape %extracted_slice_399 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_401 = tensor.extract_slice %arg0[4988342272] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_402 = tensor.expand_shape %extracted_slice_401 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_403 = tensor.extract_slice %arg0[5005119488] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_404 = tensor.expand_shape %extracted_slice_403 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_405 = tensor.extract_slice %arg0[5021896704] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_406 = tensor.expand_shape %extracted_slice_405 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_407 = tensor.extract_slice %arg0[5038673920] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_408 = tensor.expand_shape %extracted_slice_407 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_409 = tensor.extract_slice %arg0[5055451136] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_410 = tensor.expand_shape %extracted_slice_409 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_411 = tensor.extract_slice %arg0[5100539904] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_412 = tensor.expand_shape %extracted_slice_411 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_413 = tensor.extract_slice %arg0[5145628672] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_414 = tensor.expand_shape %extracted_slice_413 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_415 = tensor.extract_slice %arg0[5190717440] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_416 = tensor.expand_shape %extracted_slice_415 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_417 = tensor.extract_slice %arg0[5207494656] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_418 = tensor.expand_shape %extracted_slice_417 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_419 = tensor.extract_slice %arg0[5224271872] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_420 = tensor.expand_shape %extracted_slice_419 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_421 = tensor.extract_slice %arg0[5241049088] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_422 = tensor.expand_shape %extracted_slice_421 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_423 = tensor.extract_slice %arg0[5257826304] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_424 = tensor.expand_shape %extracted_slice_423 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_425 = tensor.extract_slice %arg0[5302915072] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_426 = tensor.expand_shape %extracted_slice_425 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_427 = tensor.extract_slice %arg0[5348003840] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_428 = tensor.expand_shape %extracted_slice_427 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_429 = tensor.extract_slice %arg0[5393092608] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_430 = tensor.expand_shape %extracted_slice_429 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_431 = tensor.extract_slice %arg0[5409869824] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_432 = tensor.expand_shape %extracted_slice_431 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_433 = tensor.extract_slice %arg0[5426647040] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_434 = tensor.expand_shape %extracted_slice_433 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_435 = tensor.extract_slice %arg0[5443424256] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_436 = tensor.expand_shape %extracted_slice_435 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_437 = tensor.extract_slice %arg0[5460201472] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_438 = tensor.expand_shape %extracted_slice_437 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_439 = tensor.extract_slice %arg0[5505290240] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_440 = tensor.expand_shape %extracted_slice_439 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_441 = tensor.extract_slice %arg0[5550379008] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_442 = tensor.expand_shape %extracted_slice_441 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_443 = tensor.extract_slice %arg0[5595467776] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_444 = tensor.expand_shape %extracted_slice_443 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_445 = tensor.extract_slice %arg0[5612244992] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_446 = tensor.expand_shape %extracted_slice_445 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_447 = tensor.extract_slice %arg0[5629022208] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_448 = tensor.expand_shape %extracted_slice_447 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_449 = tensor.extract_slice %arg0[5645799424] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_450 = tensor.expand_shape %extracted_slice_449 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_451 = tensor.extract_slice %arg0[5662576640] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_452 = tensor.expand_shape %extracted_slice_451 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_453 = tensor.extract_slice %arg0[5707665408] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_454 = tensor.expand_shape %extracted_slice_453 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_455 = tensor.extract_slice %arg0[5752754176] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_456 = tensor.expand_shape %extracted_slice_455 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_457 = tensor.extract_slice %arg0[5797842944] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_458 = tensor.expand_shape %extracted_slice_457 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_459 = tensor.extract_slice %arg0[5814620160] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_460 = tensor.expand_shape %extracted_slice_459 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_461 = tensor.extract_slice %arg0[5831397376] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_462 = tensor.expand_shape %extracted_slice_461 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_463 = tensor.extract_slice %arg0[5848174592] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_464 = tensor.expand_shape %extracted_slice_463 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_465 = tensor.extract_slice %arg0[5864951808] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_466 = tensor.expand_shape %extracted_slice_465 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_467 = tensor.extract_slice %arg0[5910040576] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_468 = tensor.expand_shape %extracted_slice_467 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_469 = tensor.extract_slice %arg0[5955129344] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_470 = tensor.expand_shape %extracted_slice_469 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_471 = tensor.extract_slice %arg0[6000218112] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_472 = tensor.expand_shape %extracted_slice_471 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_473 = tensor.extract_slice %arg0[6016995328] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_474 = tensor.expand_shape %extracted_slice_473 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_475 = tensor.extract_slice %arg0[6033772544] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_476 = tensor.expand_shape %extracted_slice_475 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_477 = tensor.extract_slice %arg0[6050549760] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_478 = tensor.expand_shape %extracted_slice_477 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_479 = tensor.extract_slice %arg0[6067326976] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_480 = tensor.expand_shape %extracted_slice_479 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_481 = tensor.extract_slice %arg0[6112415744] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_482 = tensor.expand_shape %extracted_slice_481 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_483 = tensor.extract_slice %arg0[6157504512] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_484 = tensor.expand_shape %extracted_slice_483 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_485 = tensor.extract_slice %arg0[6202593280] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_486 = tensor.expand_shape %extracted_slice_485 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_487 = tensor.extract_slice %arg0[6219370496] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_488 = tensor.expand_shape %extracted_slice_487 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_489 = tensor.extract_slice %arg0[6236147712] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_490 = tensor.expand_shape %extracted_slice_489 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_491 = tensor.extract_slice %arg0[6252924928] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_492 = tensor.expand_shape %extracted_slice_491 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_493 = tensor.extract_slice %arg0[6269702144] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_494 = tensor.expand_shape %extracted_slice_493 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_495 = tensor.extract_slice %arg0[6314790912] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_496 = tensor.expand_shape %extracted_slice_495 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_497 = tensor.extract_slice %arg0[6359879680] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_498 = tensor.expand_shape %extracted_slice_497 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_499 = tensor.extract_slice %arg0[6404968448] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_500 = tensor.expand_shape %extracted_slice_499 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_501 = tensor.extract_slice %arg0[6421745664] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_502 = tensor.expand_shape %extracted_slice_501 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_503 = tensor.extract_slice %arg0[6438522880] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_504 = tensor.expand_shape %extracted_slice_503 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_505 = tensor.extract_slice %arg0[6455300096] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32>
+    %expanded_506 = tensor.expand_shape %extracted_slice_505 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32>
+    %extracted_slice_507 = tensor.extract_slice %arg0[6472077312] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_508 = tensor.expand_shape %extracted_slice_507 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_509 = tensor.extract_slice %arg0[6517166080] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_510 = tensor.expand_shape %extracted_slice_509 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32>
+    %extracted_slice_511 = tensor.extract_slice %arg0[6562254848] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32>
+    %expanded_512 = tensor.expand_shape %extracted_slice_511 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32>
+    %extracted_slice_513 = tensor.extract_slice %arg0[6607343616] [131072000] [1] : tensor<6755192832xf32> to tensor<131072000xf32>
+    %expanded_514 = tensor.expand_shape %extracted_slice_513 [[0, 1]] : tensor<131072000xf32> into tensor<32000x4096xf32>
+    %extracted_slice_515 = tensor.extract_slice %arg0[6738415616] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_516 = tensor.expand_shape %extracted_slice_515 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_517 = tensor.extract_slice %arg0[6738677760] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_518 = tensor.expand_shape %extracted_slice_517 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_519 = tensor.extract_slice %arg0[6738939904] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_520 = tensor.expand_shape %extracted_slice_519 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_521 = tensor.extract_slice %arg0[6739202048] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_522 = tensor.expand_shape %extracted_slice_521 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_523 = tensor.extract_slice %arg0[6739464192] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_524 = tensor.expand_shape %extracted_slice_523 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_525 = tensor.extract_slice %arg0[6739726336] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_526 = tensor.expand_shape %extracted_slice_525 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_527 = tensor.extract_slice %arg0[6739988480] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_528 = tensor.expand_shape %extracted_slice_527 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_529 = tensor.extract_slice %arg0[6740250624] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_530 = tensor.expand_shape %extracted_slice_529 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_531 = tensor.extract_slice %arg0[6740512768] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_532 = tensor.expand_shape %extracted_slice_531 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_533 = tensor.extract_slice %arg0[6740774912] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_534 = tensor.expand_shape %extracted_slice_533 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_535 = tensor.extract_slice %arg0[6741037056] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_536 = tensor.expand_shape %extracted_slice_535 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_537 = tensor.extract_slice %arg0[6741299200] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_538 = tensor.expand_shape %extracted_slice_537 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_539 = tensor.extract_slice %arg0[6741561344] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_540 = tensor.expand_shape %extracted_slice_539 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_541 = tensor.extract_slice %arg0[6741823488] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_542 = tensor.expand_shape %extracted_slice_541 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_543 = tensor.extract_slice %arg0[6742085632] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_544 = tensor.expand_shape %extracted_slice_543 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_545 = tensor.extract_slice %arg0[6742347776] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_546 = tensor.expand_shape %extracted_slice_545 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_547 = tensor.extract_slice %arg0[6742609920] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_548 = tensor.expand_shape %extracted_slice_547 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_549 = tensor.extract_slice %arg0[6742872064] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_550 = tensor.expand_shape %extracted_slice_549 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_551 = tensor.extract_slice %arg0[6743134208] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_552 = tensor.expand_shape %extracted_slice_551 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_553 = tensor.extract_slice %arg0[6743396352] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_554 = tensor.expand_shape %extracted_slice_553 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_555 = tensor.extract_slice %arg0[6743658496] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_556 = tensor.expand_shape %extracted_slice_555 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_557 = tensor.extract_slice %arg0[6743920640] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_558 = tensor.expand_shape %extracted_slice_557 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_559 = tensor.extract_slice %arg0[6744182784] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_560 = tensor.expand_shape %extracted_slice_559 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_561 = tensor.extract_slice %arg0[6744444928] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_562 = tensor.expand_shape %extracted_slice_561 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_563 = tensor.extract_slice %arg0[6744707072] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_564 = tensor.expand_shape %extracted_slice_563 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_565 = tensor.extract_slice %arg0[6744969216] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_566 = tensor.expand_shape %extracted_slice_565 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_567 = tensor.extract_slice %arg0[6745231360] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_568 = tensor.expand_shape %extracted_slice_567 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_569 = tensor.extract_slice %arg0[6745493504] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_570 = tensor.expand_shape %extracted_slice_569 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_571 = tensor.extract_slice %arg0[6745755648] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_572 = tensor.expand_shape %extracted_slice_571 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_573 = tensor.extract_slice %arg0[6746017792] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_574 = tensor.expand_shape %extracted_slice_573 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_575 = tensor.extract_slice %arg0[6746279936] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_576 = tensor.expand_shape %extracted_slice_575 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_577 = tensor.extract_slice %arg0[6746542080] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_578 = tensor.expand_shape %extracted_slice_577 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_579 = tensor.extract_slice %arg0[6746804224] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_580 = tensor.expand_shape %extracted_slice_579 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_581 = tensor.extract_slice %arg0[6747066368] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_582 = tensor.expand_shape %extracted_slice_581 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_583 = tensor.extract_slice %arg0[6747328512] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_584 = tensor.expand_shape %extracted_slice_583 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_585 = tensor.extract_slice %arg0[6747590656] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_586 = tensor.expand_shape %extracted_slice_585 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_587 = tensor.extract_slice %arg0[6747852800] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_588 = tensor.expand_shape %extracted_slice_587 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_589 = tensor.extract_slice %arg0[6748114944] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_590 = tensor.expand_shape %extracted_slice_589 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_591 = tensor.extract_slice %arg0[6748377088] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_592 = tensor.expand_shape %extracted_slice_591 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_593 = tensor.extract_slice %arg0[6748639232] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_594 = tensor.expand_shape %extracted_slice_593 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_595 = tensor.extract_slice %arg0[6748901376] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_596 = tensor.expand_shape %extracted_slice_595 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_597 = tensor.extract_slice %arg0[6749163520] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_598 = tensor.expand_shape %extracted_slice_597 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_599 = tensor.extract_slice %arg0[6749425664] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_600 = tensor.expand_shape %extracted_slice_599 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_601 = tensor.extract_slice %arg0[6749687808] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_602 = tensor.expand_shape %extracted_slice_601 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_603 = tensor.extract_slice %arg0[6749949952] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_604 = tensor.expand_shape %extracted_slice_603 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_605 = tensor.extract_slice %arg0[6750212096] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_606 = tensor.expand_shape %extracted_slice_605 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_607 = tensor.extract_slice %arg0[6750474240] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_608 = tensor.expand_shape %extracted_slice_607 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_609 = tensor.extract_slice %arg0[6750736384] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_610 = tensor.expand_shape %extracted_slice_609 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_611 = tensor.extract_slice %arg0[6750998528] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_612 = tensor.expand_shape %extracted_slice_611 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_613 = tensor.extract_slice %arg0[6751260672] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_614 = tensor.expand_shape %extracted_slice_613 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_615 = tensor.extract_slice %arg0[6751522816] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_616 = tensor.expand_shape %extracted_slice_615 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_617 = tensor.extract_slice %arg0[6751784960] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_618 = tensor.expand_shape %extracted_slice_617 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_619 = tensor.extract_slice %arg0[6752047104] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_620 = tensor.expand_shape %extracted_slice_619 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_621 = tensor.extract_slice %arg0[6752309248] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_622 = tensor.expand_shape %extracted_slice_621 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_623 = tensor.extract_slice %arg0[6752571392] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_624 = tensor.expand_shape %extracted_slice_623 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_625 = tensor.extract_slice %arg0[6752833536] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_626 = tensor.expand_shape %extracted_slice_625 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_627 = tensor.extract_slice %arg0[6753095680] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_628 = tensor.expand_shape %extracted_slice_627 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_629 = tensor.extract_slice %arg0[6753357824] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_630 = tensor.expand_shape %extracted_slice_629 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_631 = tensor.extract_slice %arg0[6753619968] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_632 = tensor.expand_shape %extracted_slice_631 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_633 = tensor.extract_slice %arg0[6753882112] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_634 = tensor.expand_shape %extracted_slice_633 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_635 = tensor.extract_slice %arg0[6754144256] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_636 = tensor.expand_shape %extracted_slice_635 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_637 = tensor.extract_slice %arg0[6754406400] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_638 = tensor.expand_shape %extracted_slice_637 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_639 = tensor.extract_slice %arg0[6754668544] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_640 = tensor.expand_shape %extracted_slice_639 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %extracted_slice_641 = tensor.extract_slice %arg0[6754930688] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32>
+    %expanded_642 = tensor.expand_shape %extracted_slice_641 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32>
+    %cst = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]> : tensor<80xi64>
+    %cst_643 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]]> : tensor<1x80xi64>
+    %0 = tensor.empty() : tensor<1x80xi32>
+    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<1x80xi64>) outs(%0 : tensor<1x80xi32>) {
+    ^bb0(%in: i64, %out: i32):
+      %3652 = arith.trunci %in : i64 to i32
+      linalg.yield %3652 : i32
+    } -> tensor<1x80xi32>
+    %expanded_644 = tensor.expand_shape %expanded [[0, 1], [2]] : tensor<32000x4096xf32> into tensor<1x32000x4096xf32>
+    %2 = tensor.empty() : tensor<1x80x4096xf32>
+    %3 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x80xi32>) outs(%2 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: i32, %out: f32):
+      %3652 = linalg.index 0 : index
+      %3653 = arith.index_cast %in : i32 to index
+      %3654 = linalg.index 2 : index
+      %extracted = tensor.extract %expanded_644[%3652, %3653, %3654] : tensor<1x32000x4096xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_645 = arith.constant dense<true> : tensor<1x80xi1>
+    %cst_646 = arith.constant dense<-3.40282347E+38> : tensor<80x80xf32>
+    %cst_647 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]> : tensor<80xi64>
+    %cst_648 = arith.constant dense<1> : tensor<80xi64>
+    %4 = tensor.empty() : tensor<80xi64>
+    %5 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel"]} ins(%cst_647, %cst_648 : tensor<80xi64>, tensor<80xi64>) outs(%4 : tensor<80xi64>) {
+    ^bb0(%in: i64, %in_2684: i64, %out: i64):
+      %3652 = arith.addi %in, %in_2684 : i64
+      linalg.yield %3652 : i64
+    } -> tensor<80xi64>
+    %expanded_649 = tensor.expand_shape %5 [[0, 1]] : tensor<80xi64> into tensor<80x1xi64>
+    %6 = tensor.empty() : tensor<80x80xi1>
+    %7 = linalg.generic {indexing_maps = [#map4, #map5, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%cst_647, %expanded_649 : tensor<80xi64>, tensor<80x1xi64>) outs(%6 : tensor<80x80xi1>) {
+    ^bb0(%in: i64, %in_2684: i64, %out: i1):
+      %3652 = arith.cmpi slt, %in, %in_2684 : i64
+      linalg.yield %3652 : i1
+    } -> tensor<80x80xi1>
+    %cst_650 = arith.constant 0.000000e+00 : f32
+    %8 = tensor.empty() : tensor<80x80xf32>
+    %9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%cst_646, %7 : tensor<80x80xf32>, tensor<80x80xi1>) outs(%8 : tensor<80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: i1, %out: f32):
+      %3652 = arith.select %in_2684, %cst_650, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<80x80xf32>
+    %cst_651 = arith.constant dense<true> : tensor<1x80xi1>
+    %expanded_652 = tensor.expand_shape %cst_651 [[0, 1], [2]] : tensor<1x80xi1> into tensor<1x1x80xi1>
+    %expanded_653 = tensor.expand_shape %expanded_652 [[0], [1, 2], [3]] : tensor<1x1x80xi1> into tensor<1x1x1x80xi1>
+    %cst_654 = arith.constant dense<false> : tensor<1x1x80x80xi1>
+    %10 = tensor.empty() : tensor<1x1x80x80xi1>
+    %collapsed = tensor.collapse_shape %expanded_653 [[0], [1, 2], [3]] : tensor<1x1x1x80xi1> into tensor<1x1x80xi1>
+    %11 = linalg.generic {indexing_maps = [#map6, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed, %cst_654 : tensor<1x1x80xi1>, tensor<1x1x80x80xi1>) outs(%10 : tensor<1x1x80x80xi1>) {
+    ^bb0(%in: i1, %in_2684: i1, %out: i1):
+      %3652 = arith.addi %in, %in_2684 : i1
+      linalg.yield %3652 : i1
+    } -> tensor<1x1x80x80xi1>
+    %12 = tensor.empty() : tensor<1x1x80x80xf32>
+    %13 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<1x1x80x80xi1>) outs(%12 : tensor<1x1x80x80xf32>) {
+    ^bb0(%in: i1, %out: f32):
+      %3652 = arith.extui %in : i1 to i32
+      %3653 = arith.sitofp %3652 : i32 to f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x1x80x80xf32>
+    %cst_655 = arith.constant 1.000000e+00 : f32
+    %14 = tensor.empty() : tensor<1x1x80x80xf32>
+    %15 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x1x80x80xf32>) outs(%14 : tensor<1x1x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.subf %cst_655, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x1x80x80xf32>
+    %16 = tensor.empty() : tensor<1x1x80x80xi1>
+    %17 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<1x1x80x80xf32>) outs(%16 : tensor<1x1x80x80xi1>) {
+    ^bb0(%in: f32, %out: i1):
+      %3652 = arith.fptosi %in : f32 to i32
+      %3653 = arith.trunci %3652 : i32 to i1
+      linalg.yield %3653 : i1
+    } -> tensor<1x1x80x80xi1>
+    %cst_656 = arith.constant -3.40282347E+38 : f32
+    %18 = tensor.empty() : tensor<1x1x80x80xf32>
+    %19 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %17 : tensor<1x1x80x80xf32>, tensor<1x1x80x80xi1>) outs(%18 : tensor<1x1x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: i1, %out: f32):
+      %3652 = arith.select %in_2684, %cst_656, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x1x80x80xf32>
+    %expanded_657 = tensor.expand_shape %9 [[0, 1], [2]] : tensor<80x80xf32> into tensor<1x80x80xf32>
+    %expanded_658 = tensor.expand_shape %expanded_657 [[0, 1], [2], [3]] : tensor<1x80x80xf32> into tensor<1x1x80x80xf32>
+    %cst_659 = arith.constant dense<0.000000e+00> : tensor<1x1x80x80xf32>
+    %20 = tensor.empty() : tensor<1x1x80x80xf32>
+    %21 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %expanded_658 : tensor<1x1x80x80xf32>, tensor<1x1x80x80xf32>) outs(%20 : tensor<1x1x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x1x80x80xf32>
+    %22 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_660 = arith.constant 2.000000e+00 : f32
+    %23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x80x4096xf32>) outs(%22 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_660 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_661 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %24 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%23 : tensor<1x80x4096xf32>) outs(%cst_661 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_662 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %25 = tensor.empty() : tensor<1x80x1xf32>
+    %26 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%24, %cst_662 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%25 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %27 = tensor.empty() : tensor<1x80x1xf32>
+    %28 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26 : tensor<1x80x1xf32>) outs(%27 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %29 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_663 = tensor.collapse_shape %28 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %30 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %collapsed_663 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%29 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_664 = tensor.expand_shape %extracted_slice [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %31 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_665 = tensor.collapse_shape %expanded_664 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %32 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_665, %30 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%31 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %33 = tensor.empty() : tensor<4096x4096xf32>
+    %34 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_66 : tensor<4096x4096xf32>) outs(%33 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_666 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_667 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %35 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_666, %34 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_667 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_668 = tensor.expand_shape %35 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %36 = tensor.empty() : tensor<4096x4096xf32>
+    %37 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_68 : tensor<4096x4096xf32>) outs(%36 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_669 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_670 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %38 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_669, %37 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_670 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_671 = tensor.expand_shape %38 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %39 = tensor.empty() : tensor<4096x4096xf32>
+    %40 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_70 : tensor<4096x4096xf32>) outs(%39 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_672 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_673 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %41 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_672, %40 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_673 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_674 = tensor.expand_shape %41 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_675 = tensor.expand_shape %expanded_668 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %42 = tensor.empty() : tensor<1x32x80x128xf32>
+    %43 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_675 : tensor<1x80x32x128xf32>) outs(%42 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_676 = tensor.expand_shape %expanded_671 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %44 = tensor.empty() : tensor<1x32x80x128xf32>
+    %45 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_676 : tensor<1x80x32x128xf32>) outs(%44 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_677 = tensor.expand_shape %expanded_674 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %46 = tensor.empty() : tensor<1x32x80x128xf32>
+    %47 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_677 : tensor<1x80x32x128xf32>) outs(%46 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_678 = tensor.extract_slice %expanded_516[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_679 = tensor.extract_slice %expanded_518[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %48 = tensor.empty() : tensor<1x80x128xf32>
+    %49 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_678 : tensor<1x1x80x128xf32>) outs(%48 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %50 = tensor.empty() : tensor<80x128xf32>
+    %51 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%49 : tensor<1x80x128xf32>) outs(%50 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %52 = tensor.empty() : tensor<1x80x128xf32>
+    %53 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_679 : tensor<1x1x80x128xf32>) outs(%52 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %54 = tensor.empty() : tensor<80x128xf32>
+    %55 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1x80x128xf32>) outs(%54 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %56 = tensor.empty() : tensor<1x80x128xf32>
+    %57 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%56 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %51[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_680 = tensor.expand_shape %57 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %58 = tensor.empty() : tensor<1x80x128xf32>
+    %59 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%58 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %55[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_681 = tensor.expand_shape %59 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %60 = tensor.empty() : tensor<1x32x80x128xf32>
+    %61 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%43, %57 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%60 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_682 = tensor.extract_slice %43[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_683 = tensor.extract_slice %43[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %62 = tensor.empty() : tensor<1x32x80x64xf32>
+    %63 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_683 : tensor<1x32x80x64xf32>) outs(%62 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %64 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice = tensor.insert_slice %63 into %64[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_684 = tensor.insert_slice %extracted_slice_682 into %inserted_slice[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %65 = tensor.empty() : tensor<1x32x80x128xf32>
+    %66 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_684, %59 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%65 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %67 = tensor.empty() : tensor<1x32x80x128xf32>
+    %68 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%61, %66 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%67 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %69 = tensor.empty() : tensor<1x32x80x128xf32>
+    %70 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%45, %57 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%69 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_685 = tensor.extract_slice %45[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_686 = tensor.extract_slice %45[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %71 = tensor.empty() : tensor<1x32x80x64xf32>
+    %72 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_686 : tensor<1x32x80x64xf32>) outs(%71 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %73 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_687 = tensor.insert_slice %72 into %73[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_688 = tensor.insert_slice %extracted_slice_685 into %inserted_slice_687[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %74 = tensor.empty() : tensor<1x32x80x128xf32>
+    %75 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_688, %59 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%74 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %76 = tensor.empty() : tensor<1x32x80x128xf32>
+    %77 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%70, %75 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%76 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %78 = tensor.empty() : tensor<1x32x128x80xf32>
+    %79 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%77 : tensor<1x32x80x128xf32>) outs(%78 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_689 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_690 = tensor.collapse_shape %68 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_691 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_692 = tensor.collapse_shape %79 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_693 = arith.constant 0.000000e+00 : f32
+    %80 = tensor.empty() : tensor<32x80x80xf32>
+    %81 = linalg.fill ins(%cst_693 : f32) outs(%80 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %82 = linalg.batch_matmul ins(%collapsed_690, %collapsed_692 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%81 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_694 = tensor.expand_shape %82 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_695 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %83 = tensor.empty() : tensor<1x32x80x80xf32>
+    %84 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_695 : tensor<1x32x80x80xf32>) outs(%83 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %85 = tensor.empty() : tensor<1x32x80x80xf32>
+    %86 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_694, %84 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%85 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %87 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_696 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %88 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%86, %collapsed_696 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%87 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %89 = tensor.empty() : tensor<1x32x80x1xf32>
+    %90 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%89 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %91 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%88 : tensor<1x32x80x80xf32>) outs(%90 : tensor<1x32x80x1xf32>) { // Max-reduction of %88 into a 1x32x80x1 result; the body reads %out, so the accumulator must start from the -inf fill %90 rather than the uninitialized tensor.empty %89.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %92 = tensor.empty() : tensor<1x32x80x80xf32>
+    %93 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%88, %91 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%92 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %94 = tensor.empty() : tensor<1x32x80x1xf32>
+    %95 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%94 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %96 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%93 : tensor<1x32x80x80xf32>) outs(%95 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %97 = tensor.empty() : tensor<1x32x80x80xf32>
+    %98 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%93, %96 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%97 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_697 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_698 = tensor.collapse_shape %98 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_699 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_700 = tensor.collapse_shape %47 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_701 = arith.constant 0.000000e+00 : f32
+    %99 = tensor.empty() : tensor<32x80x128xf32>
+    %100 = linalg.fill ins(%cst_701 : f32) outs(%99 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %101 = linalg.batch_matmul ins(%collapsed_698, %collapsed_700 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%100 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_702 = tensor.expand_shape %101 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %102 = tensor.empty() : tensor<1x80x32x128xf32>
+    %103 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_702 : tensor<1x32x80x128xf32>) outs(%102 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_703 = tensor.collapse_shape %103 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %104 = tensor.empty() : tensor<4096x4096xf32>
+    %105 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_72 : tensor<4096x4096xf32>) outs(%104 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_704 = tensor.collapse_shape %collapsed_703 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_705 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %106 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_704, %105 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_705 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_706 = tensor.expand_shape %106 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %107 = tensor.empty() : tensor<1x80x4096xf32>
+    %108 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %expanded_706 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%107 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %109 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_707 = arith.constant 2.000000e+00 : f32
+    %110 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108 : tensor<1x80x4096xf32>) outs(%109 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_707 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_708 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %111 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%110 : tensor<1x80x4096xf32>) outs(%cst_708 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_709 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %112 = tensor.empty() : tensor<1x80x1xf32>
+    %113 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%111, %cst_709 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%112 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %114 = tensor.empty() : tensor<1x80x1xf32>
+    %115 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%113 : tensor<1x80x1xf32>) outs(%114 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %116 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_710 = tensor.collapse_shape %115 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %117 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108, %collapsed_710 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%116 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_711 = tensor.expand_shape %extracted_slice_0 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %118 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_712 = tensor.collapse_shape %expanded_711 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %119 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_712, %117 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%118 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %120 = tensor.empty() : tensor<4096x11008xf32>
+    %121 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_74 : tensor<11008x4096xf32>) outs(%120 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_713 = tensor.collapse_shape %119 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_714 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %122 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_713, %121 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_714 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_715 = tensor.expand_shape %122 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %123 = tensor.empty() : tensor<1x80x11008xf32>
+    %124 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_715 : tensor<1x80x11008xf32>) outs(%123 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %125 = tensor.empty() : tensor<4096x11008xf32>
+    %126 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_76 : tensor<11008x4096xf32>) outs(%125 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_716 = tensor.collapse_shape %119 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_717 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %127 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_716, %126 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_717 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_718 = tensor.expand_shape %127 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %128 = tensor.empty() : tensor<1x80x11008xf32>
+    %129 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%124, %expanded_718 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%128 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %130 = tensor.empty() : tensor<11008x4096xf32>
+    %131 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_78 : tensor<4096x11008xf32>) outs(%130 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_719 = tensor.collapse_shape %129 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_720 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %132 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_719, %131 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_720 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_721 = tensor.expand_shape %132 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %133 = tensor.empty() : tensor<1x80x4096xf32>
+    %134 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108, %expanded_721 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%133 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %135 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_722 = arith.constant 2.000000e+00 : f32
+    %136 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134 : tensor<1x80x4096xf32>) outs(%135 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_722 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_723 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %137 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%136 : tensor<1x80x4096xf32>) outs(%cst_723 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_724 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %138 = tensor.empty() : tensor<1x80x1xf32>
+    %139 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%137, %cst_724 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%138 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %140 = tensor.empty() : tensor<1x80x1xf32>
+    %141 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%139 : tensor<1x80x1xf32>) outs(%140 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %142 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_725 = tensor.collapse_shape %141 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %143 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134, %collapsed_725 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%142 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_726 = tensor.expand_shape %extracted_slice_1 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %144 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_727 = tensor.collapse_shape %expanded_726 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %145 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_727, %143 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%144 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %146 = tensor.empty() : tensor<4096x4096xf32>
+    %147 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_80 : tensor<4096x4096xf32>) outs(%146 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_728 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_729 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %148 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_728, %147 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_729 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_730 = tensor.expand_shape %148 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %149 = tensor.empty() : tensor<4096x4096xf32>
+    %150 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_82 : tensor<4096x4096xf32>) outs(%149 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_731 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_732 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %151 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_731, %150 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_732 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_733 = tensor.expand_shape %151 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %152 = tensor.empty() : tensor<4096x4096xf32>
+    %153 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_84 : tensor<4096x4096xf32>) outs(%152 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_734 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_735 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %154 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_734, %153 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_735 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_736 = tensor.expand_shape %154 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_737 = tensor.expand_shape %expanded_730 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %155 = tensor.empty() : tensor<1x32x80x128xf32>
+    %156 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_737 : tensor<1x80x32x128xf32>) outs(%155 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_738 = tensor.expand_shape %expanded_733 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %157 = tensor.empty() : tensor<1x32x80x128xf32>
+    %158 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_738 : tensor<1x80x32x128xf32>) outs(%157 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_739 = tensor.expand_shape %expanded_736 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %159 = tensor.empty() : tensor<1x32x80x128xf32>
+    %160 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_739 : tensor<1x80x32x128xf32>) outs(%159 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_740 = tensor.extract_slice %expanded_520[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_741 = tensor.extract_slice %expanded_522[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %161 = tensor.empty() : tensor<1x80x128xf32>
+    %162 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_740 : tensor<1x1x80x128xf32>) outs(%161 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %163 = tensor.empty() : tensor<80x128xf32>
+    %164 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%162 : tensor<1x80x128xf32>) outs(%163 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %165 = tensor.empty() : tensor<1x80x128xf32>
+    %166 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_741 : tensor<1x1x80x128xf32>) outs(%165 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %167 = tensor.empty() : tensor<80x128xf32>
+    %168 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%166 : tensor<1x80x128xf32>) outs(%167 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %169 = tensor.empty() : tensor<1x80x128xf32>
+    %170 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%169 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %164[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_742 = tensor.expand_shape %170 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %171 = tensor.empty() : tensor<1x80x128xf32>
+    %172 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%171 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %168[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_743 = tensor.expand_shape %172 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %173 = tensor.empty() : tensor<1x32x80x128xf32>
+    %174 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%156, %170 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%173 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_744 = tensor.extract_slice %156[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_745 = tensor.extract_slice %156[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %175 = tensor.empty() : tensor<1x32x80x64xf32>
+    %176 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_745 : tensor<1x32x80x64xf32>) outs(%175 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %177 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_746 = tensor.insert_slice %176 into %177[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_747 = tensor.insert_slice %extracted_slice_744 into %inserted_slice_746[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %178 = tensor.empty() : tensor<1x32x80x128xf32>
+    %179 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_747, %172 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%178 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %180 = tensor.empty() : tensor<1x32x80x128xf32>
+    %181 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%174, %179 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%180 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %182 = tensor.empty() : tensor<1x32x80x128xf32>
+    %183 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158, %170 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%182 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_748 = tensor.extract_slice %158[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_749 = tensor.extract_slice %158[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %184 = tensor.empty() : tensor<1x32x80x64xf32>
+    %185 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_749 : tensor<1x32x80x64xf32>) outs(%184 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %186 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_750 = tensor.insert_slice %185 into %186[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_751 = tensor.insert_slice %extracted_slice_748 into %inserted_slice_750[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %187 = tensor.empty() : tensor<1x32x80x128xf32>
+    %188 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_751, %172 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%187 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %189 = tensor.empty() : tensor<1x32x80x128xf32>
+    %190 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%183, %188 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%189 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %191 = tensor.empty() : tensor<1x32x128x80xf32>
+    %192 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%190 : tensor<1x32x80x128xf32>) outs(%191 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_752 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_753 = tensor.collapse_shape %181 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_754 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_755 = tensor.collapse_shape %192 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_756 = arith.constant 0.000000e+00 : f32
+    %193 = tensor.empty() : tensor<32x80x80xf32>
+    %194 = linalg.fill ins(%cst_756 : f32) outs(%193 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %195 = linalg.batch_matmul ins(%collapsed_753, %collapsed_755 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%194 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_757 = tensor.expand_shape %195 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_758 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %196 = tensor.empty() : tensor<1x32x80x80xf32>
+    %197 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_758 : tensor<1x32x80x80xf32>) outs(%196 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %198 = tensor.empty() : tensor<1x32x80x80xf32>
+    %199 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_757, %197 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%198 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %200 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_759 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %201 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%199, %collapsed_759 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%200 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %202 = tensor.empty() : tensor<1x32x80x1xf32>
+    %203 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%202 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %204 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%201 : tensor<1x32x80x80xf32>) outs(%203 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):  // %out is seeded from %203 (-inf fill), the identity for max; tensor.empty %202 has undefined contents
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %205 = tensor.empty() : tensor<1x32x80x80xf32>
+    %206 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%201, %204 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%205 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %207 = tensor.empty() : tensor<1x32x80x1xf32>
+    %208 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%207 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %209 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%206 : tensor<1x32x80x80xf32>) outs(%208 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %210 = tensor.empty() : tensor<1x32x80x80xf32>
+    %211 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%206, %209 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%210 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_760 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_761 = tensor.collapse_shape %211 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_762 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_763 = tensor.collapse_shape %160 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_764 = arith.constant 0.000000e+00 : f32
+    %212 = tensor.empty() : tensor<32x80x128xf32>
+    %213 = linalg.fill ins(%cst_764 : f32) outs(%212 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %214 = linalg.batch_matmul ins(%collapsed_761, %collapsed_763 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%213 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_765 = tensor.expand_shape %214 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %215 = tensor.empty() : tensor<1x80x32x128xf32>
+    %216 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_765 : tensor<1x32x80x128xf32>) outs(%215 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_766 = tensor.collapse_shape %216 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %217 = tensor.empty() : tensor<4096x4096xf32>
+    %218 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_86 : tensor<4096x4096xf32>) outs(%217 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_767 = tensor.collapse_shape %collapsed_766 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_768 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %219 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_767, %218 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_768 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_769 = tensor.expand_shape %219 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %220 = tensor.empty() : tensor<1x80x4096xf32>
+    %221 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134, %expanded_769 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%220 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %222 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_770 = arith.constant 2.000000e+00 : f32
+    %223 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221 : tensor<1x80x4096xf32>) outs(%222 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_770 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_771 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %224 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%223 : tensor<1x80x4096xf32>) outs(%cst_771 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_772 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %225 = tensor.empty() : tensor<1x80x1xf32>
+    %226 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%224, %cst_772 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%225 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %227 = tensor.empty() : tensor<1x80x1xf32>
+    %228 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%226 : tensor<1x80x1xf32>) outs(%227 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %229 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_773 = tensor.collapse_shape %228 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %230 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221, %collapsed_773 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%229 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_774 = tensor.expand_shape %extracted_slice_2 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %231 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_775 = tensor.collapse_shape %expanded_774 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %232 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_775, %230 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%231 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %233 = tensor.empty() : tensor<4096x11008xf32>
+    %234 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_88 : tensor<11008x4096xf32>) outs(%233 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_776 = tensor.collapse_shape %232 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_777 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %235 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_776, %234 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_777 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_778 = tensor.expand_shape %235 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %236 = tensor.empty() : tensor<1x80x11008xf32>
+    %237 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_778 : tensor<1x80x11008xf32>) outs(%236 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %238 = tensor.empty() : tensor<4096x11008xf32>
+    %239 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_90 : tensor<11008x4096xf32>) outs(%238 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_779 = tensor.collapse_shape %232 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_780 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %240 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_779, %239 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_780 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_781 = tensor.expand_shape %240 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %241 = tensor.empty() : tensor<1x80x11008xf32>
+    %242 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%237, %expanded_781 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%241 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %243 = tensor.empty() : tensor<11008x4096xf32>
+    %244 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_92 : tensor<4096x11008xf32>) outs(%243 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_782 = tensor.collapse_shape %242 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_783 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %245 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_782, %244 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_783 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_784 = tensor.expand_shape %245 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %246 = tensor.empty() : tensor<1x80x4096xf32>
+    %247 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221, %expanded_784 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%246 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %248 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_785 = arith.constant 2.000000e+00 : f32
+    %249 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247 : tensor<1x80x4096xf32>) outs(%248 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_785 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_786 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %250 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%249 : tensor<1x80x4096xf32>) outs(%cst_786 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_787 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %251 = tensor.empty() : tensor<1x80x1xf32>
+    %252 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%250, %cst_787 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%251 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %253 = tensor.empty() : tensor<1x80x1xf32>
+    %254 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%252 : tensor<1x80x1xf32>) outs(%253 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %255 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_788 = tensor.collapse_shape %254 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %256 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247, %collapsed_788 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%255 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_789 = tensor.expand_shape %extracted_slice_3 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %257 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_790 = tensor.collapse_shape %expanded_789 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %258 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_790, %256 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%257 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %259 = tensor.empty() : tensor<4096x4096xf32>
+    %260 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_94 : tensor<4096x4096xf32>) outs(%259 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_791 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_792 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %261 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_791, %260 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_792 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_793 = tensor.expand_shape %261 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %262 = tensor.empty() : tensor<4096x4096xf32>
+    %263 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_96 : tensor<4096x4096xf32>) outs(%262 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_794 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_795 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %264 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_794, %263 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_795 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_796 = tensor.expand_shape %264 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %265 = tensor.empty() : tensor<4096x4096xf32>
+    %266 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_98 : tensor<4096x4096xf32>) outs(%265 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_797 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_798 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %267 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_797, %266 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_798 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_799 = tensor.expand_shape %267 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_800 = tensor.expand_shape %expanded_793 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %268 = tensor.empty() : tensor<1x32x80x128xf32>
+    %269 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_800 : tensor<1x80x32x128xf32>) outs(%268 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_801 = tensor.expand_shape %expanded_796 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %270 = tensor.empty() : tensor<1x32x80x128xf32>
+    %271 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_801 : tensor<1x80x32x128xf32>) outs(%270 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_802 = tensor.expand_shape %expanded_799 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %272 = tensor.empty() : tensor<1x32x80x128xf32>
+    %273 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_802 : tensor<1x80x32x128xf32>) outs(%272 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_803 = tensor.extract_slice %expanded_524[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_804 = tensor.extract_slice %expanded_526[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %274 = tensor.empty() : tensor<1x80x128xf32>
+    %275 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_803 : tensor<1x1x80x128xf32>) outs(%274 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %276 = tensor.empty() : tensor<80x128xf32>
+    %277 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%275 : tensor<1x80x128xf32>) outs(%276 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %278 = tensor.empty() : tensor<1x80x128xf32>
+    %279 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_804 : tensor<1x1x80x128xf32>) outs(%278 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %280 = tensor.empty() : tensor<80x128xf32>
+    %281 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%279 : tensor<1x80x128xf32>) outs(%280 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %282 = tensor.empty() : tensor<1x80x128xf32>
+    %283 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%282 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %277[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_805 = tensor.expand_shape %283 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %284 = tensor.empty() : tensor<1x80x128xf32>
+    %285 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%284 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %281[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_806 = tensor.expand_shape %285 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %286 = tensor.empty() : tensor<1x32x80x128xf32>
+    %287 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%269, %283 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%286 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_807 = tensor.extract_slice %269[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_808 = tensor.extract_slice %269[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %288 = tensor.empty() : tensor<1x32x80x64xf32>
+    %289 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_808 : tensor<1x32x80x64xf32>) outs(%288 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %290 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_809 = tensor.insert_slice %289 into %290[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_810 = tensor.insert_slice %extracted_slice_807 into %inserted_slice_809[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %291 = tensor.empty() : tensor<1x32x80x128xf32>
+    %292 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_810, %285 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%291 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %293 = tensor.empty() : tensor<1x32x80x128xf32>
+    %294 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%287, %292 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%293 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %295 = tensor.empty() : tensor<1x32x80x128xf32>
+    %296 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%271, %283 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%295 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_811 = tensor.extract_slice %271[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_812 = tensor.extract_slice %271[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %297 = tensor.empty() : tensor<1x32x80x64xf32>
+    %298 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_812 : tensor<1x32x80x64xf32>) outs(%297 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %299 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_813 = tensor.insert_slice %298 into %299[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_814 = tensor.insert_slice %extracted_slice_811 into %inserted_slice_813[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %300 = tensor.empty() : tensor<1x32x80x128xf32>
+    %301 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_814, %285 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%300 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %302 = tensor.empty() : tensor<1x32x80x128xf32>
+    %303 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%296, %301 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%302 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %304 = tensor.empty() : tensor<1x32x128x80xf32>
+    %305 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%303 : tensor<1x32x80x128xf32>) outs(%304 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_815 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_816 = tensor.collapse_shape %294 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_817 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_818 = tensor.collapse_shape %305 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_819 = arith.constant 0.000000e+00 : f32
+    %306 = tensor.empty() : tensor<32x80x80xf32>
+    %307 = linalg.fill ins(%cst_819 : f32) outs(%306 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %308 = linalg.batch_matmul ins(%collapsed_816, %collapsed_818 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%307 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_820 = tensor.expand_shape %308 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_821 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %309 = tensor.empty() : tensor<1x32x80x80xf32>
+    %310 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_821 : tensor<1x32x80x80xf32>) outs(%309 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %311 = tensor.empty() : tensor<1x32x80x80xf32>
+    %312 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_820, %310 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%311 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %313 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_822 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %314 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%312, %collapsed_822 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%313 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %315 = tensor.empty() : tensor<1x32x80x1xf32>  // destination shape only; contents are undefined until filled
+    %316 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%315 : tensor<1x32x80x1xf32>) {  // fill with the initial value of the max reduction below
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32  // f32 bit pattern of -infinity
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %317 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%314 : tensor<1x32x80x80xf32>) outs(%316 : tensor<1x32x80x1xf32>) {  // max over the last (reduction) dim of %314; seeded from the -inf fill %316 rather than the uninitialized %315, mirroring how the sum reduction %322 is seeded from %321
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %318 = tensor.empty() : tensor<1x32x80x80xf32>
+    %319 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%314, %317 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%318 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %320 = tensor.empty() : tensor<1x32x80x1xf32>
+    %321 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%320 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %322 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%319 : tensor<1x32x80x80xf32>) outs(%321 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %323 = tensor.empty() : tensor<1x32x80x80xf32>
+    %324 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%319, %322 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%323 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_823 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_824 = tensor.collapse_shape %324 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_825 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_826 = tensor.collapse_shape %273 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_827 = arith.constant 0.000000e+00 : f32
+    %325 = tensor.empty() : tensor<32x80x128xf32>
+    %326 = linalg.fill ins(%cst_827 : f32) outs(%325 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %327 = linalg.batch_matmul ins(%collapsed_824, %collapsed_826 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%326 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_828 = tensor.expand_shape %327 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %328 = tensor.empty() : tensor<1x80x32x128xf32>
+    %329 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_828 : tensor<1x32x80x128xf32>) outs(%328 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_829 = tensor.collapse_shape %329 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %330 = tensor.empty() : tensor<4096x4096xf32>
+    %331 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_100 : tensor<4096x4096xf32>) outs(%330 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_830 = tensor.collapse_shape %collapsed_829 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_831 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %332 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_830, %331 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_831 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_832 = tensor.expand_shape %332 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %333 = tensor.empty() : tensor<1x80x4096xf32>
+    %334 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247, %expanded_832 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%333 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %335 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_833 = arith.constant 2.000000e+00 : f32
+    %336 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334 : tensor<1x80x4096xf32>) outs(%335 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_833 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_834 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %337 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%336 : tensor<1x80x4096xf32>) outs(%cst_834 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_835 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %338 = tensor.empty() : tensor<1x80x1xf32>
+    %339 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%337, %cst_835 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%338 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %340 = tensor.empty() : tensor<1x80x1xf32>
+    %341 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%339 : tensor<1x80x1xf32>) outs(%340 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %342 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_836 = tensor.collapse_shape %341 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %343 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334, %collapsed_836 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%342 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_837 = tensor.expand_shape %extracted_slice_4 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %344 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_838 = tensor.collapse_shape %expanded_837 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %345 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_838, %343 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%344 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %346 = tensor.empty() : tensor<4096x11008xf32>
+    %347 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_102 : tensor<11008x4096xf32>) outs(%346 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_839 = tensor.collapse_shape %345 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_840 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %348 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_839, %347 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_840 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_841 = tensor.expand_shape %348 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %349 = tensor.empty() : tensor<1x80x11008xf32>
+    %350 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_841 : tensor<1x80x11008xf32>) outs(%349 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %351 = tensor.empty() : tensor<4096x11008xf32>
+    %352 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_104 : tensor<11008x4096xf32>) outs(%351 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_842 = tensor.collapse_shape %345 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_843 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %353 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_842, %352 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_843 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_844 = tensor.expand_shape %353 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %354 = tensor.empty() : tensor<1x80x11008xf32>
+    %355 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%350, %expanded_844 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%354 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %356 = tensor.empty() : tensor<11008x4096xf32>
+    %357 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_106 : tensor<4096x11008xf32>) outs(%356 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_845 = tensor.collapse_shape %355 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_846 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %358 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_845, %357 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_846 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_847 = tensor.expand_shape %358 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %359 = tensor.empty() : tensor<1x80x4096xf32>
+    %360 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334, %expanded_847 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%359 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %361 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_848 = arith.constant 2.000000e+00 : f32
+    %362 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360 : tensor<1x80x4096xf32>) outs(%361 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_848 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_849 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %363 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%362 : tensor<1x80x4096xf32>) outs(%cst_849 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_850 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %364 = tensor.empty() : tensor<1x80x1xf32>
+    %365 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%363, %cst_850 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%364 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %366 = tensor.empty() : tensor<1x80x1xf32>
+    %367 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%365 : tensor<1x80x1xf32>) outs(%366 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %368 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_851 = tensor.collapse_shape %367 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %369 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360, %collapsed_851 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%368 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_852 = tensor.expand_shape %extracted_slice_5 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %370 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_853 = tensor.collapse_shape %expanded_852 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %371 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_853, %369 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%370 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %372 = tensor.empty() : tensor<4096x4096xf32>
+    %373 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_108 : tensor<4096x4096xf32>) outs(%372 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_854 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_855 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %374 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_854, %373 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_855 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_856 = tensor.expand_shape %374 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %375 = tensor.empty() : tensor<4096x4096xf32>
+    %376 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_110 : tensor<4096x4096xf32>) outs(%375 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_857 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_858 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %377 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_857, %376 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_858 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_859 = tensor.expand_shape %377 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %378 = tensor.empty() : tensor<4096x4096xf32>
+    %379 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_112 : tensor<4096x4096xf32>) outs(%378 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_860 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_861 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %380 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_860, %379 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_861 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_862 = tensor.expand_shape %380 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_863 = tensor.expand_shape %expanded_856 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %381 = tensor.empty() : tensor<1x32x80x128xf32>
+    %382 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_863 : tensor<1x80x32x128xf32>) outs(%381 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_864 = tensor.expand_shape %expanded_859 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %383 = tensor.empty() : tensor<1x32x80x128xf32>
+    %384 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_864 : tensor<1x80x32x128xf32>) outs(%383 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_865 = tensor.expand_shape %expanded_862 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %385 = tensor.empty() : tensor<1x32x80x128xf32>
+    %386 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_865 : tensor<1x80x32x128xf32>) outs(%385 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_866 = tensor.extract_slice %expanded_528[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_867 = tensor.extract_slice %expanded_530[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %387 = tensor.empty() : tensor<1x80x128xf32>
+    %388 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_866 : tensor<1x1x80x128xf32>) outs(%387 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %389 = tensor.empty() : tensor<80x128xf32>
+    %390 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%388 : tensor<1x80x128xf32>) outs(%389 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %391 = tensor.empty() : tensor<1x80x128xf32>
+    %392 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_867 : tensor<1x1x80x128xf32>) outs(%391 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %393 = tensor.empty() : tensor<80x128xf32>
+    %394 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%392 : tensor<1x80x128xf32>) outs(%393 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %395 = tensor.empty() : tensor<1x80x128xf32>
+    %396 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%395 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %390[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_868 = tensor.expand_shape %396 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %397 = tensor.empty() : tensor<1x80x128xf32>
+    %398 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%397 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %394[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_869 = tensor.expand_shape %398 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %399 = tensor.empty() : tensor<1x32x80x128xf32>
+    %400 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%382, %396 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%399 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_870 = tensor.extract_slice %382[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_871 = tensor.extract_slice %382[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %401 = tensor.empty() : tensor<1x32x80x64xf32>
+    %402 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_871 : tensor<1x32x80x64xf32>) outs(%401 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %403 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_872 = tensor.insert_slice %402 into %403[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_873 = tensor.insert_slice %extracted_slice_870 into %inserted_slice_872[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %404 = tensor.empty() : tensor<1x32x80x128xf32>
+    %405 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_873, %398 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%404 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %406 = tensor.empty() : tensor<1x32x80x128xf32>
+    %407 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%400, %405 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%406 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %408 = tensor.empty() : tensor<1x32x80x128xf32>
+    %409 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%384, %396 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%408 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_874 = tensor.extract_slice %384[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_875 = tensor.extract_slice %384[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %410 = tensor.empty() : tensor<1x32x80x64xf32>
+    %411 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_875 : tensor<1x32x80x64xf32>) outs(%410 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %412 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_876 = tensor.insert_slice %411 into %412[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_877 = tensor.insert_slice %extracted_slice_874 into %inserted_slice_876[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %413 = tensor.empty() : tensor<1x32x80x128xf32>
+    %414 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_877, %398 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%413 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %415 = tensor.empty() : tensor<1x32x80x128xf32>
+    %416 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%409, %414 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%415 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %417 = tensor.empty() : tensor<1x32x128x80xf32>
+    %418 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%416 : tensor<1x32x80x128xf32>) outs(%417 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_878 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_879 = tensor.collapse_shape %407 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_880 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_881 = tensor.collapse_shape %418 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_882 = arith.constant 0.000000e+00 : f32
+    %419 = tensor.empty() : tensor<32x80x80xf32>
+    %420 = linalg.fill ins(%cst_882 : f32) outs(%419 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %421 = linalg.batch_matmul ins(%collapsed_879, %collapsed_881 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%420 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_883 = tensor.expand_shape %421 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_884 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %422 = tensor.empty() : tensor<1x32x80x80xf32>
+    %423 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_884 : tensor<1x32x80x80xf32>) outs(%422 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %424 = tensor.empty() : tensor<1x32x80x80xf32>
+    %425 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_883, %423 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%424 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %426 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_885 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %427 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%425, %collapsed_885 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%426 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %428 = tensor.empty() : tensor<1x32x80x1xf32>
+    %429 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%428 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %430 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%427 : tensor<1x32x80x80xf32>) outs(%428 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %431 = tensor.empty() : tensor<1x32x80x80xf32>
+    %432 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%427, %430 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%431 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %433 = tensor.empty() : tensor<1x32x80x1xf32>
+    %434 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%433 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %435 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%432 : tensor<1x32x80x80xf32>) outs(%434 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %436 = tensor.empty() : tensor<1x32x80x80xf32>
+    %437 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%432, %435 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%436 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_886 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_887 = tensor.collapse_shape %437 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_888 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_889 = tensor.collapse_shape %386 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_890 = arith.constant 0.000000e+00 : f32
+    %438 = tensor.empty() : tensor<32x80x128xf32>
+    %439 = linalg.fill ins(%cst_890 : f32) outs(%438 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %440 = linalg.batch_matmul ins(%collapsed_887, %collapsed_889 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%439 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_891 = tensor.expand_shape %440 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %441 = tensor.empty() : tensor<1x80x32x128xf32>
+    %442 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_891 : tensor<1x32x80x128xf32>) outs(%441 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_892 = tensor.collapse_shape %442 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %443 = tensor.empty() : tensor<4096x4096xf32>
+    %444 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_114 : tensor<4096x4096xf32>) outs(%443 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_893 = tensor.collapse_shape %collapsed_892 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_894 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %445 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_893, %444 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_894 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_895 = tensor.expand_shape %445 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %446 = tensor.empty() : tensor<1x80x4096xf32>
+    %447 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360, %expanded_895 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%446 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %448 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_896 = arith.constant 2.000000e+00 : f32
+    %449 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447 : tensor<1x80x4096xf32>) outs(%448 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_896 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_897 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %450 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%449 : tensor<1x80x4096xf32>) outs(%cst_897 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_898 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %451 = tensor.empty() : tensor<1x80x1xf32>
+    %452 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%450, %cst_898 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%451 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %453 = tensor.empty() : tensor<1x80x1xf32>
+    %454 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%452 : tensor<1x80x1xf32>) outs(%453 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %455 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_899 = tensor.collapse_shape %454 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %456 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447, %collapsed_899 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%455 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_900 = tensor.expand_shape %extracted_slice_6 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %457 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_901 = tensor.collapse_shape %expanded_900 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %458 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_901, %456 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%457 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %459 = tensor.empty() : tensor<4096x11008xf32>
+    %460 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_116 : tensor<11008x4096xf32>) outs(%459 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_902 = tensor.collapse_shape %458 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_903 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %461 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_902, %460 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_903 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_904 = tensor.expand_shape %461 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %462 = tensor.empty() : tensor<1x80x11008xf32>
+    %463 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_904 : tensor<1x80x11008xf32>) outs(%462 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %464 = tensor.empty() : tensor<4096x11008xf32>
+    %465 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_118 : tensor<11008x4096xf32>) outs(%464 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_905 = tensor.collapse_shape %458 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_906 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %466 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_905, %465 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_906 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_907 = tensor.expand_shape %466 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %467 = tensor.empty() : tensor<1x80x11008xf32>
+    %468 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%463, %expanded_907 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%467 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %469 = tensor.empty() : tensor<11008x4096xf32>
+    %470 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_120 : tensor<4096x11008xf32>) outs(%469 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_908 = tensor.collapse_shape %468 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_909 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %471 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_908, %470 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_909 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_910 = tensor.expand_shape %471 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %472 = tensor.empty() : tensor<1x80x4096xf32>
+    %473 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447, %expanded_910 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%472 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %474 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_911 = arith.constant 2.000000e+00 : f32
+    %475 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473 : tensor<1x80x4096xf32>) outs(%474 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_911 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_912 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %476 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%475 : tensor<1x80x4096xf32>) outs(%cst_912 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_913 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %477 = tensor.empty() : tensor<1x80x1xf32>
+    %478 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%476, %cst_913 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%477 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %479 = tensor.empty() : tensor<1x80x1xf32>
+    %480 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%478 : tensor<1x80x1xf32>) outs(%479 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %481 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_914 = tensor.collapse_shape %480 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %482 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %collapsed_914 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%481 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_915 = tensor.expand_shape %extracted_slice_7 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %483 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_916 = tensor.collapse_shape %expanded_915 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %484 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_916, %482 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%483 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %485 = tensor.empty() : tensor<4096x4096xf32>
+    %486 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_122 : tensor<4096x4096xf32>) outs(%485 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_917 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_918 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %487 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_917, %486 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_918 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_919 = tensor.expand_shape %487 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %488 = tensor.empty() : tensor<4096x4096xf32>
+    %489 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_124 : tensor<4096x4096xf32>) outs(%488 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_920 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_921 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %490 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_920, %489 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_921 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_922 = tensor.expand_shape %490 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %491 = tensor.empty() : tensor<4096x4096xf32>
+    %492 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_126 : tensor<4096x4096xf32>) outs(%491 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_923 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_924 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %493 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_923, %492 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_924 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_925 = tensor.expand_shape %493 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_926 = tensor.expand_shape %expanded_919 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %494 = tensor.empty() : tensor<1x32x80x128xf32>
+    %495 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_926 : tensor<1x80x32x128xf32>) outs(%494 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_927 = tensor.expand_shape %expanded_922 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %496 = tensor.empty() : tensor<1x32x80x128xf32>
+    %497 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_927 : tensor<1x80x32x128xf32>) outs(%496 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_928 = tensor.expand_shape %expanded_925 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %498 = tensor.empty() : tensor<1x32x80x128xf32>
+    %499 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_928 : tensor<1x80x32x128xf32>) outs(%498 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_929 = tensor.extract_slice %expanded_532[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_930 = tensor.extract_slice %expanded_534[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %500 = tensor.empty() : tensor<1x80x128xf32>
+    %501 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_929 : tensor<1x1x80x128xf32>) outs(%500 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %502 = tensor.empty() : tensor<80x128xf32>
+    %503 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%501 : tensor<1x80x128xf32>) outs(%502 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %504 = tensor.empty() : tensor<1x80x128xf32>
+    %505 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_930 : tensor<1x1x80x128xf32>) outs(%504 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %506 = tensor.empty() : tensor<80x128xf32>
+    %507 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%505 : tensor<1x80x128xf32>) outs(%506 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %508 = tensor.empty() : tensor<1x80x128xf32>
+    %509 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%508 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %503[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_931 = tensor.expand_shape %509 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %510 = tensor.empty() : tensor<1x80x128xf32>
+    %511 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%510 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %507[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_932 = tensor.expand_shape %511 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %512 = tensor.empty() : tensor<1x32x80x128xf32>
+    %513 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%495, %509 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%512 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_933 = tensor.extract_slice %495[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_934 = tensor.extract_slice %495[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %514 = tensor.empty() : tensor<1x32x80x64xf32>
+    %515 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_934 : tensor<1x32x80x64xf32>) outs(%514 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %516 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_935 = tensor.insert_slice %515 into %516[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_936 = tensor.insert_slice %extracted_slice_933 into %inserted_slice_935[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %517 = tensor.empty() : tensor<1x32x80x128xf32>
+    %518 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_936, %511 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%517 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %519 = tensor.empty() : tensor<1x32x80x128xf32>
+    %520 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%513, %518 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%519 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %521 = tensor.empty() : tensor<1x32x80x128xf32>
+    %522 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%497, %509 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%521 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_937 = tensor.extract_slice %497[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_938 = tensor.extract_slice %497[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %523 = tensor.empty() : tensor<1x32x80x64xf32>
+    %524 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_938 : tensor<1x32x80x64xf32>) outs(%523 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %525 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_939 = tensor.insert_slice %524 into %525[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_940 = tensor.insert_slice %extracted_slice_937 into %inserted_slice_939[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %526 = tensor.empty() : tensor<1x32x80x128xf32>
+    %527 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_940, %511 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%526 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %528 = tensor.empty() : tensor<1x32x80x128xf32>
+    %529 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%522, %527 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%528 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %530 = tensor.empty() : tensor<1x32x128x80xf32>
+    %531 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%529 : tensor<1x32x80x128xf32>) outs(%530 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_941 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_942 = tensor.collapse_shape %520 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_943 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_944 = tensor.collapse_shape %531 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_945 = arith.constant 0.000000e+00 : f32
+    %532 = tensor.empty() : tensor<32x80x80xf32>
+    %533 = linalg.fill ins(%cst_945 : f32) outs(%532 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %534 = linalg.batch_matmul ins(%collapsed_942, %collapsed_944 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%533 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_946 = tensor.expand_shape %534 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_947 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %535 = tensor.empty() : tensor<1x32x80x80xf32>
+    %536 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_947 : tensor<1x32x80x80xf32>) outs(%535 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %537 = tensor.empty() : tensor<1x32x80x80xf32>
+    %538 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_946, %536 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%537 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %539 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_948 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %540 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%538, %collapsed_948 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%539 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %541 = tensor.empty() : tensor<1x32x80x1xf32>
+    %542 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%541 : tensor<1x32x80x1xf32>) {  // fill the max accumulator
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32  // f32 bit pattern of -inf: identity element for max
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %543 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%540 : tensor<1x32x80x80xf32>) outs(%542 : tensor<1x32x80x1xf32>) {  // max over the last (reduction) dim; seeded from the -inf fill %542, not the uninitialized %541
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32  // maximumf propagates NaN, so the seed must be well-defined
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %544 = tensor.empty() : tensor<1x32x80x80xf32>
+    %545 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%540, %543 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%544 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %546 = tensor.empty() : tensor<1x32x80x1xf32>
+    %547 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%546 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %548 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%545 : tensor<1x32x80x80xf32>) outs(%547 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %549 = tensor.empty() : tensor<1x32x80x80xf32>
+    %550 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%545, %548 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%549 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_949 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_950 = tensor.collapse_shape %550 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_951 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_952 = tensor.collapse_shape %499 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_953 = arith.constant 0.000000e+00 : f32
+    %551 = tensor.empty() : tensor<32x80x128xf32>
+    %552 = linalg.fill ins(%cst_953 : f32) outs(%551 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %553 = linalg.batch_matmul ins(%collapsed_950, %collapsed_952 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%552 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_954 = tensor.expand_shape %553 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %554 = tensor.empty() : tensor<1x80x32x128xf32>
+    %555 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_954 : tensor<1x32x80x128xf32>) outs(%554 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_955 = tensor.collapse_shape %555 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %556 = tensor.empty() : tensor<4096x4096xf32>
+    %557 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_128 : tensor<4096x4096xf32>) outs(%556 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_956 = tensor.collapse_shape %collapsed_955 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_957 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %558 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_956, %557 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_957 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_958 = tensor.expand_shape %558 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %559 = tensor.empty() : tensor<1x80x4096xf32>
+    %560 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %expanded_958 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%559 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %561 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_959 = arith.constant 2.000000e+00 : f32
+    %562 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560 : tensor<1x80x4096xf32>) outs(%561 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_959 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_960 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %563 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%562 : tensor<1x80x4096xf32>) outs(%cst_960 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_961 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %564 = tensor.empty() : tensor<1x80x1xf32>
+    %565 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%563, %cst_961 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%564 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %566 = tensor.empty() : tensor<1x80x1xf32>
+    %567 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%565 : tensor<1x80x1xf32>) outs(%566 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %568 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_962 = tensor.collapse_shape %567 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %569 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560, %collapsed_962 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%568 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_963 = tensor.expand_shape %extracted_slice_8 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %570 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_964 = tensor.collapse_shape %expanded_963 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %571 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_964, %569 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%570 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %572 = tensor.empty() : tensor<4096x11008xf32>
+    %573 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_130 : tensor<11008x4096xf32>) outs(%572 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_965 = tensor.collapse_shape %571 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_966 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %574 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_965, %573 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_966 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_967 = tensor.expand_shape %574 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %575 = tensor.empty() : tensor<1x80x11008xf32>
+    %576 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_967 : tensor<1x80x11008xf32>) outs(%575 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %577 = tensor.empty() : tensor<4096x11008xf32>
+    %578 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_132 : tensor<11008x4096xf32>) outs(%577 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_968 = tensor.collapse_shape %571 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_969 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %579 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_968, %578 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_969 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_970 = tensor.expand_shape %579 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %580 = tensor.empty() : tensor<1x80x11008xf32>
+    %581 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%576, %expanded_970 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%580 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %582 = tensor.empty() : tensor<11008x4096xf32>
+    %583 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_134 : tensor<4096x11008xf32>) outs(%582 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_971 = tensor.collapse_shape %581 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_972 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %584 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_971, %583 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_972 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_973 = tensor.expand_shape %584 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %585 = tensor.empty() : tensor<1x80x4096xf32>
+    %586 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560, %expanded_973 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%585 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %587 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_974 = arith.constant 2.000000e+00 : f32
+    %588 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586 : tensor<1x80x4096xf32>) outs(%587 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_974 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_975 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %589 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%588 : tensor<1x80x4096xf32>) outs(%cst_975 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_976 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %590 = tensor.empty() : tensor<1x80x1xf32>
+    %591 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%589, %cst_976 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%590 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %592 = tensor.empty() : tensor<1x80x1xf32>
+    %593 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%591 : tensor<1x80x1xf32>) outs(%592 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %594 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_977 = tensor.collapse_shape %593 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %595 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586, %collapsed_977 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%594 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_978 = tensor.expand_shape %extracted_slice_9 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %596 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_979 = tensor.collapse_shape %expanded_978 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %597 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_979, %595 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%596 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %598 = tensor.empty() : tensor<4096x4096xf32>
+    %599 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_136 : tensor<4096x4096xf32>) outs(%598 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_980 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_981 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %600 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_980, %599 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_981 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_982 = tensor.expand_shape %600 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %601 = tensor.empty() : tensor<4096x4096xf32>
+    %602 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_138 : tensor<4096x4096xf32>) outs(%601 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_983 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_984 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %603 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_983, %602 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_984 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_985 = tensor.expand_shape %603 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %604 = tensor.empty() : tensor<4096x4096xf32>
+    %605 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_140 : tensor<4096x4096xf32>) outs(%604 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_986 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_987 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %606 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_986, %605 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_987 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_988 = tensor.expand_shape %606 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_989 = tensor.expand_shape %expanded_982 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %607 = tensor.empty() : tensor<1x32x80x128xf32>
+    %608 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_989 : tensor<1x80x32x128xf32>) outs(%607 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_990 = tensor.expand_shape %expanded_985 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %609 = tensor.empty() : tensor<1x32x80x128xf32>
+    %610 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_990 : tensor<1x80x32x128xf32>) outs(%609 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_991 = tensor.expand_shape %expanded_988 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %611 = tensor.empty() : tensor<1x32x80x128xf32>
+    %612 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_991 : tensor<1x80x32x128xf32>) outs(%611 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_992 = tensor.extract_slice %expanded_536[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_993 = tensor.extract_slice %expanded_538[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %613 = tensor.empty() : tensor<1x80x128xf32>
+    %614 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_992 : tensor<1x1x80x128xf32>) outs(%613 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %615 = tensor.empty() : tensor<80x128xf32>
+    %616 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%614 : tensor<1x80x128xf32>) outs(%615 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %617 = tensor.empty() : tensor<1x80x128xf32>
+    %618 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_993 : tensor<1x1x80x128xf32>) outs(%617 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %619 = tensor.empty() : tensor<80x128xf32>
+    %620 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%618 : tensor<1x80x128xf32>) outs(%619 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %621 = tensor.empty() : tensor<1x80x128xf32>
+    %622 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%621 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %616[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_994 = tensor.expand_shape %622 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %623 = tensor.empty() : tensor<1x80x128xf32>
+    %624 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%623 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %620[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_995 = tensor.expand_shape %624 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %625 = tensor.empty() : tensor<1x32x80x128xf32>
+    %626 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%608, %622 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%625 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_996 = tensor.extract_slice %608[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_997 = tensor.extract_slice %608[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %627 = tensor.empty() : tensor<1x32x80x64xf32>
+    %628 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_997 : tensor<1x32x80x64xf32>) outs(%627 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %629 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_998 = tensor.insert_slice %628 into %629[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_999 = tensor.insert_slice %extracted_slice_996 into %inserted_slice_998[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %630 = tensor.empty() : tensor<1x32x80x128xf32>
+    %631 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_999, %624 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%630 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %632 = tensor.empty() : tensor<1x32x80x128xf32>
+    %633 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%626, %631 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%632 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %634 = tensor.empty() : tensor<1x32x80x128xf32>
+    %635 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%610, %622 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%634 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1000 = tensor.extract_slice %610[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1001 = tensor.extract_slice %610[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %636 = tensor.empty() : tensor<1x32x80x64xf32>
+    %637 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1001 : tensor<1x32x80x64xf32>) outs(%636 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %638 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1002 = tensor.insert_slice %637 into %638[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1003 = tensor.insert_slice %extracted_slice_1000 into %inserted_slice_1002[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %639 = tensor.empty() : tensor<1x32x80x128xf32>
+    %640 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1003, %624 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%639 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %641 = tensor.empty() : tensor<1x32x80x128xf32>
+    %642 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%635, %640 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%641 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %643 = tensor.empty() : tensor<1x32x128x80xf32>
+    %644 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%642 : tensor<1x32x80x128xf32>) outs(%643 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1004 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1005 = tensor.collapse_shape %633 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1006 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1007 = tensor.collapse_shape %644 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1008 = arith.constant 0.000000e+00 : f32
+    %645 = tensor.empty() : tensor<32x80x80xf32>
+    %646 = linalg.fill ins(%cst_1008 : f32) outs(%645 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %647 = linalg.batch_matmul ins(%collapsed_1005, %collapsed_1007 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%646 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1009 = tensor.expand_shape %647 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1010 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %648 = tensor.empty() : tensor<1x32x80x80xf32>
+    %649 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1010 : tensor<1x32x80x80xf32>) outs(%648 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %650 = tensor.empty() : tensor<1x32x80x80xf32>
+    %651 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1009, %649 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%650 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %652 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1011 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %653 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%651, %collapsed_1011 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%652 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %654 = tensor.empty() : tensor<1x32x80x1xf32>
+    %655 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%654 : tensor<1x32x80x1xf32>) { // Fill the accumulator with -inf (0xFF800000), the identity element for maximumf.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %656 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%653 : tensor<1x32x80x80xf32>) outs(%655 : tensor<1x32x80x1xf32>) { // Max-reduction seeded from the -inf fill %655; seeding from the bare tensor.empty %654 would read unspecified values.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %657 = tensor.empty() : tensor<1x32x80x80xf32>
+    %658 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%653, %656 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%657 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %659 = tensor.empty() : tensor<1x32x80x1xf32>
+    %660 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%659 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %661 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%658 : tensor<1x32x80x80xf32>) outs(%660 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %662 = tensor.empty() : tensor<1x32x80x80xf32>
+    %663 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%658, %661 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%662 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1012 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1013 = tensor.collapse_shape %663 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1014 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1015 = tensor.collapse_shape %612 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1016 = arith.constant 0.000000e+00 : f32
+    %664 = tensor.empty() : tensor<32x80x128xf32>
+    %665 = linalg.fill ins(%cst_1016 : f32) outs(%664 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %666 = linalg.batch_matmul ins(%collapsed_1013, %collapsed_1015 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%665 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1017 = tensor.expand_shape %666 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %667 = tensor.empty() : tensor<1x80x32x128xf32>
+    %668 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1017 : tensor<1x32x80x128xf32>) outs(%667 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1018 = tensor.collapse_shape %668 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %669 = tensor.empty() : tensor<4096x4096xf32>
+    %670 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_142 : tensor<4096x4096xf32>) outs(%669 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1019 = tensor.collapse_shape %collapsed_1018 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1020 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %671 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1019, %670 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1020 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1021 = tensor.expand_shape %671 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %672 = tensor.empty() : tensor<1x80x4096xf32>
+    %673 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586, %expanded_1021 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%672 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %674 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1022 = arith.constant 2.000000e+00 : f32
+    %675 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673 : tensor<1x80x4096xf32>) outs(%674 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1022 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1023 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %676 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%675 : tensor<1x80x4096xf32>) outs(%cst_1023 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1024 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %677 = tensor.empty() : tensor<1x80x1xf32>
+    %678 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%676, %cst_1024 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%677 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %679 = tensor.empty() : tensor<1x80x1xf32>
+    %680 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%678 : tensor<1x80x1xf32>) outs(%679 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %681 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1025 = tensor.collapse_shape %680 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %682 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673, %collapsed_1025 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%681 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1026 = tensor.expand_shape %extracted_slice_10 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %683 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1027 = tensor.collapse_shape %expanded_1026 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %684 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1027, %682 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%683 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %685 = tensor.empty() : tensor<4096x11008xf32>
+    %686 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_144 : tensor<11008x4096xf32>) outs(%685 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1028 = tensor.collapse_shape %684 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1029 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %687 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1028, %686 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1029 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1030 = tensor.expand_shape %687 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %688 = tensor.empty() : tensor<1x80x11008xf32>
+    %689 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1030 : tensor<1x80x11008xf32>) outs(%688 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %690 = tensor.empty() : tensor<4096x11008xf32>
+    %691 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_146 : tensor<11008x4096xf32>) outs(%690 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1031 = tensor.collapse_shape %684 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1032 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %692 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1031, %691 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1032 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1033 = tensor.expand_shape %692 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %693 = tensor.empty() : tensor<1x80x11008xf32>
+    %694 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%689, %expanded_1033 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%693 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %695 = tensor.empty() : tensor<11008x4096xf32>
+    %696 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_148 : tensor<4096x11008xf32>) outs(%695 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1034 = tensor.collapse_shape %694 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1035 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %697 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1034, %696 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1035 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1036 = tensor.expand_shape %697 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %698 = tensor.empty() : tensor<1x80x4096xf32>
+    %699 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673, %expanded_1036 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%698 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %700 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1037 = arith.constant 2.000000e+00 : f32
+    %701 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699 : tensor<1x80x4096xf32>) outs(%700 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1037 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1038 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %702 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%701 : tensor<1x80x4096xf32>) outs(%cst_1038 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1039 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %703 = tensor.empty() : tensor<1x80x1xf32>
+    %704 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%702, %cst_1039 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%703 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %705 = tensor.empty() : tensor<1x80x1xf32>
+    %706 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%704 : tensor<1x80x1xf32>) outs(%705 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %707 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1040 = tensor.collapse_shape %706 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %708 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699, %collapsed_1040 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%707 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1041 = tensor.expand_shape %extracted_slice_11 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %709 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1042 = tensor.collapse_shape %expanded_1041 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %710 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1042, %708 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%709 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %711 = tensor.empty() : tensor<4096x4096xf32>
+    %712 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_150 : tensor<4096x4096xf32>) outs(%711 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1043 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1044 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %713 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1043, %712 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1044 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1045 = tensor.expand_shape %713 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %714 = tensor.empty() : tensor<4096x4096xf32>
+    %715 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_152 : tensor<4096x4096xf32>) outs(%714 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1046 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1047 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %716 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1046, %715 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1047 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1048 = tensor.expand_shape %716 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %717 = tensor.empty() : tensor<4096x4096xf32>
+    %718 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_154 : tensor<4096x4096xf32>) outs(%717 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1049 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1050 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %719 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1049, %718 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1050 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1051 = tensor.expand_shape %719 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1052 = tensor.expand_shape %expanded_1045 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %720 = tensor.empty() : tensor<1x32x80x128xf32>
+    %721 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1052 : tensor<1x80x32x128xf32>) outs(%720 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1053 = tensor.expand_shape %expanded_1048 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %722 = tensor.empty() : tensor<1x32x80x128xf32>
+    %723 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1053 : tensor<1x80x32x128xf32>) outs(%722 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1054 = tensor.expand_shape %expanded_1051 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %724 = tensor.empty() : tensor<1x32x80x128xf32>
+    %725 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1054 : tensor<1x80x32x128xf32>) outs(%724 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1055 = tensor.extract_slice %expanded_540[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1056 = tensor.extract_slice %expanded_542[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %726 = tensor.empty() : tensor<1x80x128xf32>
+    %727 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1055 : tensor<1x1x80x128xf32>) outs(%726 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %728 = tensor.empty() : tensor<80x128xf32>
+    %729 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%727 : tensor<1x80x128xf32>) outs(%728 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %730 = tensor.empty() : tensor<1x80x128xf32>
+    %731 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1056 : tensor<1x1x80x128xf32>) outs(%730 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %732 = tensor.empty() : tensor<80x128xf32>
+    %733 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%731 : tensor<1x80x128xf32>) outs(%732 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %734 = tensor.empty() : tensor<1x80x128xf32>
+    %735 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%734 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %729[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1057 = tensor.expand_shape %735 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %736 = tensor.empty() : tensor<1x80x128xf32>
+    %737 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%736 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %733[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1058 = tensor.expand_shape %737 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %738 = tensor.empty() : tensor<1x32x80x128xf32>
+    %739 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%721, %735 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%738 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1059 = tensor.extract_slice %721[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1060 = tensor.extract_slice %721[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %740 = tensor.empty() : tensor<1x32x80x64xf32>
+    %741 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1060 : tensor<1x32x80x64xf32>) outs(%740 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %742 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1061 = tensor.insert_slice %741 into %742[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1062 = tensor.insert_slice %extracted_slice_1059 into %inserted_slice_1061[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %743 = tensor.empty() : tensor<1x32x80x128xf32>
+    %744 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1062, %737 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%743 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %745 = tensor.empty() : tensor<1x32x80x128xf32>
+    %746 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%739, %744 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%745 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %747 = tensor.empty() : tensor<1x32x80x128xf32>
+    %748 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%723, %735 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%747 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1063 = tensor.extract_slice %723[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1064 = tensor.extract_slice %723[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %749 = tensor.empty() : tensor<1x32x80x64xf32>
+    %750 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1064 : tensor<1x32x80x64xf32>) outs(%749 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %751 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1065 = tensor.insert_slice %750 into %751[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1066 = tensor.insert_slice %extracted_slice_1063 into %inserted_slice_1065[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %752 = tensor.empty() : tensor<1x32x80x128xf32>
+    %753 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1066, %737 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%752 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %754 = tensor.empty() : tensor<1x32x80x128xf32>
+    %755 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%748, %753 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%754 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %756 = tensor.empty() : tensor<1x32x128x80xf32>
+    %757 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%755 : tensor<1x32x80x128xf32>) outs(%756 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1067 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1068 = tensor.collapse_shape %746 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1069 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1070 = tensor.collapse_shape %757 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1071 = arith.constant 0.000000e+00 : f32
+    %758 = tensor.empty() : tensor<32x80x80xf32>
+    %759 = linalg.fill ins(%cst_1071 : f32) outs(%758 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %760 = linalg.batch_matmul ins(%collapsed_1068, %collapsed_1070 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%759 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1072 = tensor.expand_shape %760 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1073 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>  // splat divisor; numerically sqrt(128) -- presumably sqrt of the 128-wide contraction dim of %760, TODO confirm
+    %761 = tensor.empty() : tensor<1x32x80x80xf32>
+    %762 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1073 : tensor<1x32x80x80xf32>) outs(%761 : tensor<1x32x80x80xf32>) {  // elementwise reciprocal of the splat constant
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32  // 1 / 11.3137083
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %763 = tensor.empty() : tensor<1x32x80x80xf32>
+    %764 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1072, %762 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%763 : tensor<1x32x80x80xf32>) {  // scale the reshaped batch_matmul result by the reciprocal
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32  // matmul_result * (1 / 11.3137083)
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %765 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1074 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %766 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%764, %collapsed_1074 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%765 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %767 = tensor.empty() : tensor<1x32x80x1xf32>
+    %768 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%767 : tensor<1x32x80x1xf32>) {  // fill the reduction accumulator with -inf, the identity for max
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32  // IEEE-754 f32 bit pattern for -infinity
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %769 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%766 : tensor<1x32x80x80xf32>) outs(%768 : tensor<1x32x80x1xf32>) {  // max-reduce %766 over the 4th iterator; init must be the -inf tensor %768, not the undefined tensor.empty %767
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32  // running maximum; %out starts at -inf
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %770 = tensor.empty() : tensor<1x32x80x80xf32>
+    %771 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%766, %769 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%770 : tensor<1x32x80x80xf32>) {  // exp(%766 - %769), with the 1-wide max-reduction result %769 as the subtrahend
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32  // shift by the reduced maximum before exponentiating
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %772 = tensor.empty() : tensor<1x32x80x1xf32>
+    %773 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%772 : tensor<1x32x80x1xf32>) {  // zero-fill the accumulator for the sum reduction
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %774 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%771 : tensor<1x32x80x80xf32>) outs(%773 : tensor<1x32x80x1xf32>) {  // sum of the exponentials over the 4th (reduction) iterator
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32  // running sum; %out starts at 0
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %775 = tensor.empty() : tensor<1x32x80x80xf32>
+    %776 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%771, %774 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%775 : tensor<1x32x80x80xf32>) {  // normalise: each exponential divided by its reduced sum (softmax-style)
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1075 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1076 = tensor.collapse_shape %776 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1077 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1078 = tensor.collapse_shape %725 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1079 = arith.constant 0.000000e+00 : f32
+    %777 = tensor.empty() : tensor<32x80x128xf32>
+    %778 = linalg.fill ins(%cst_1079 : f32) outs(%777 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %779 = linalg.batch_matmul ins(%collapsed_1076, %collapsed_1078 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%778 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1080 = tensor.expand_shape %779 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %780 = tensor.empty() : tensor<1x80x32x128xf32>
+    %781 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1080 : tensor<1x32x80x128xf32>) outs(%780 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1081 = tensor.collapse_shape %781 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %782 = tensor.empty() : tensor<4096x4096xf32>
+    %783 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_156 : tensor<4096x4096xf32>) outs(%782 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1082 = tensor.collapse_shape %collapsed_1081 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1083 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %784 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1082, %783 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1083 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1084 = tensor.expand_shape %784 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %785 = tensor.empty() : tensor<1x80x4096xf32>
+    %786 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699, %expanded_1084 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%785 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %787 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1085 = arith.constant 2.000000e+00 : f32  // exponent used by the powf below
+    %788 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786 : tensor<1x80x4096xf32>) outs(%787 : tensor<1x80x4096xf32>) {  // elementwise square of %786
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1085 : f32  // x^2
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1086 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>  // zero initial value for the reduction accumulator
+    %789 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%788 : tensor<1x80x4096xf32>) outs(%cst_1086 : tensor<1x80x1xf32>) {  // accumulate x^2 / 4096 over the reduction iterator
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32  // each term pre-divided by 4096; presumably a mean over the 4096-wide dim -- TODO confirm against #map8/#map6
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1087 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>  // f32 nearest to 1e-5; small additive term applied before rsqrt
+    %790 = tensor.empty() : tensor<1x80x1xf32>
+    %791 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%789, %cst_1087 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%790 : tensor<1x80x1xf32>) {  // reduced value + 1e-5
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %792 = tensor.empty() : tensor<1x80x1xf32>
+    %793 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%791 : tensor<1x80x1xf32>) outs(%792 : tensor<1x80x1xf32>) {  // reciprocal square root of the sum above
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %794 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1088 = tensor.collapse_shape %793 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>  // drop the trailing unit dim so the factor can be read through #map1
+    %795 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786, %collapsed_1088 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%794 : tensor<1x80x4096xf32>) {  // scale %786 by the rsqrt factor (RMS-norm-style normalisation)
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1089 = tensor.expand_shape %extracted_slice_12 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %796 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1090 = tensor.collapse_shape %expanded_1089 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %797 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1090, %795 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%796 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %798 = tensor.empty() : tensor<4096x11008xf32>
+    %799 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_158 : tensor<11008x4096xf32>) outs(%798 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1091 = tensor.collapse_shape %797 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1092 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %800 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1091, %799 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1092 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1093 = tensor.expand_shape %800 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %801 = tensor.empty() : tensor<1x80x11008xf32>
+    %802 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1093 : tensor<1x80x11008xf32>) outs(%801 : tensor<1x80x11008xf32>) {  // elementwise x / (1 + exp(-x)), i.e. x * sigmoid(x) (SiLU)
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32  // -x
+      %3653 = math.exp %3652 : f32  // exp(-x)
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32  // 1 + exp(-x)
+      %3655 = arith.divf %in, %3654 : f32  // x / (1 + exp(-x))
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %803 = tensor.empty() : tensor<4096x11008xf32>
+    %804 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_160 : tensor<11008x4096xf32>) outs(%803 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1094 = tensor.collapse_shape %797 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1095 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %805 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1094, %804 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1095 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1096 = tensor.expand_shape %805 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %806 = tensor.empty() : tensor<1x80x11008xf32>
+    %807 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%802, %expanded_1096 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%806 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %808 = tensor.empty() : tensor<11008x4096xf32>
+    %809 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_162 : tensor<4096x11008xf32>) outs(%808 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1097 = tensor.collapse_shape %807 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1098 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %810 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1097, %809 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1098 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1099 = tensor.expand_shape %810 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %811 = tensor.empty() : tensor<1x80x4096xf32>
+    %812 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786, %expanded_1099 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%811 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %813 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1100 = arith.constant 2.000000e+00 : f32
+    %814 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812 : tensor<1x80x4096xf32>) outs(%813 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1100 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1101 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %815 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%814 : tensor<1x80x4096xf32>) outs(%cst_1101 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1102 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %816 = tensor.empty() : tensor<1x80x1xf32>
+    %817 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%815, %cst_1102 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%816 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %818 = tensor.empty() : tensor<1x80x1xf32>
+    %819 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%817 : tensor<1x80x1xf32>) outs(%818 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %820 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1103 = tensor.collapse_shape %819 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %821 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812, %collapsed_1103 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%820 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1104 = tensor.expand_shape %extracted_slice_13 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %822 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1105 = tensor.collapse_shape %expanded_1104 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %823 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1105, %821 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%822 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %824 = tensor.empty() : tensor<4096x4096xf32>
+    %825 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_164 : tensor<4096x4096xf32>) outs(%824 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1106 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1107 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %826 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1106, %825 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1107 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1108 = tensor.expand_shape %826 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %827 = tensor.empty() : tensor<4096x4096xf32>
+    %828 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_166 : tensor<4096x4096xf32>) outs(%827 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1109 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1110 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %829 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1109, %828 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1110 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1111 = tensor.expand_shape %829 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %830 = tensor.empty() : tensor<4096x4096xf32>
+    %831 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_168 : tensor<4096x4096xf32>) outs(%830 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1112 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1113 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %832 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1112, %831 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1113 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1114 = tensor.expand_shape %832 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1115 = tensor.expand_shape %expanded_1108 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %833 = tensor.empty() : tensor<1x32x80x128xf32>
+    %834 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1115 : tensor<1x80x32x128xf32>) outs(%833 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1116 = tensor.expand_shape %expanded_1111 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %835 = tensor.empty() : tensor<1x32x80x128xf32>
+    %836 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1116 : tensor<1x80x32x128xf32>) outs(%835 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1117 = tensor.expand_shape %expanded_1114 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %837 = tensor.empty() : tensor<1x32x80x128xf32>
+    %838 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1117 : tensor<1x80x32x128xf32>) outs(%837 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1118 = tensor.extract_slice %expanded_544[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1119 = tensor.extract_slice %expanded_546[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %839 = tensor.empty() : tensor<1x80x128xf32>
+    %840 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1118 : tensor<1x1x80x128xf32>) outs(%839 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %841 = tensor.empty() : tensor<80x128xf32>
+    %842 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%840 : tensor<1x80x128xf32>) outs(%841 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %843 = tensor.empty() : tensor<1x80x128xf32>
+    %844 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1119 : tensor<1x1x80x128xf32>) outs(%843 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %845 = tensor.empty() : tensor<80x128xf32>
+    %846 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%844 : tensor<1x80x128xf32>) outs(%845 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %847 = tensor.empty() : tensor<1x80x128xf32>
+    %848 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%847 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %842[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1120 = tensor.expand_shape %848 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %849 = tensor.empty() : tensor<1x80x128xf32>
+    %850 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%849 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %846[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1121 = tensor.expand_shape %850 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %851 = tensor.empty() : tensor<1x32x80x128xf32>
+    %852 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%834, %848 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%851 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1122 = tensor.extract_slice %834[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1123 = tensor.extract_slice %834[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %853 = tensor.empty() : tensor<1x32x80x64xf32>
+    %854 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1123 : tensor<1x32x80x64xf32>) outs(%853 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %855 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1124 = tensor.insert_slice %854 into %855[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1125 = tensor.insert_slice %extracted_slice_1122 into %inserted_slice_1124[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %856 = tensor.empty() : tensor<1x32x80x128xf32>
+    %857 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1125, %850 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%856 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %858 = tensor.empty() : tensor<1x32x80x128xf32>
+    %859 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%852, %857 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%858 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %860 = tensor.empty() : tensor<1x32x80x128xf32>
+    %861 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%836, %848 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%860 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1126 = tensor.extract_slice %836[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1127 = tensor.extract_slice %836[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %862 = tensor.empty() : tensor<1x32x80x64xf32>
+    %863 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1127 : tensor<1x32x80x64xf32>) outs(%862 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %864 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1128 = tensor.insert_slice %863 into %864[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1129 = tensor.insert_slice %extracted_slice_1126 into %inserted_slice_1128[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %865 = tensor.empty() : tensor<1x32x80x128xf32>
+    %866 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1129, %850 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%865 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %867 = tensor.empty() : tensor<1x32x80x128xf32>
+    %868 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%861, %866 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%867 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %869 = tensor.empty() : tensor<1x32x128x80xf32>
+    %870 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%868 : tensor<1x32x80x128xf32>) outs(%869 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1130 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1131 = tensor.collapse_shape %859 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1132 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1133 = tensor.collapse_shape %870 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1134 = arith.constant 0.000000e+00 : f32
+    %871 = tensor.empty() : tensor<32x80x80xf32>
+    %872 = linalg.fill ins(%cst_1134 : f32) outs(%871 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %873 = linalg.batch_matmul ins(%collapsed_1131, %collapsed_1133 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%872 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1135 = tensor.expand_shape %873 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1136 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %874 = tensor.empty() : tensor<1x32x80x80xf32>
+    %875 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1136 : tensor<1x32x80x80xf32>) outs(%874 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %876 = tensor.empty() : tensor<1x32x80x80xf32>
+    %877 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1135, %875 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%876 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %878 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1137 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %879 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%877, %collapsed_1137 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%878 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %880 = tensor.empty() : tensor<1x32x80x1xf32>
+    %881 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%880 : tensor<1x32x80x1xf32>) { // Seed the max accumulator: every element becomes -inf, the identity for max.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // IEEE-754 single-precision bit pattern of -inf.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %882 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%879 : tensor<1x32x80x80xf32>) outs(%881 : tensor<1x32x80x1xf32>) { // Max-reduce %879 over the last iterator; the init must be the -inf-filled %881, not the uninitialized %880.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %883 = tensor.empty() : tensor<1x32x80x80xf32>
+    %884 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%879, %882 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%883 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %885 = tensor.empty() : tensor<1x32x80x1xf32>
+    %886 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%885 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %887 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%884 : tensor<1x32x80x80xf32>) outs(%886 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %888 = tensor.empty() : tensor<1x32x80x80xf32>
+    %889 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%884, %887 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%888 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1138 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1139 = tensor.collapse_shape %889 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1140 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1141 = tensor.collapse_shape %838 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1142 = arith.constant 0.000000e+00 : f32
+    %890 = tensor.empty() : tensor<32x80x128xf32>
+    %891 = linalg.fill ins(%cst_1142 : f32) outs(%890 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %892 = linalg.batch_matmul ins(%collapsed_1139, %collapsed_1141 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%891 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1143 = tensor.expand_shape %892 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %893 = tensor.empty() : tensor<1x80x32x128xf32>
+    %894 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1143 : tensor<1x32x80x128xf32>) outs(%893 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1144 = tensor.collapse_shape %894 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %895 = tensor.empty() : tensor<4096x4096xf32>
+    %896 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_170 : tensor<4096x4096xf32>) outs(%895 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1145 = tensor.collapse_shape %collapsed_1144 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1146 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %897 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1145, %896 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1146 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1147 = tensor.expand_shape %897 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %898 = tensor.empty() : tensor<1x80x4096xf32>
+    %899 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812, %expanded_1147 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%898 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %900 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1148 = arith.constant 2.000000e+00 : f32
+    %901 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899 : tensor<1x80x4096xf32>) outs(%900 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1148 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1149 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %902 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%901 : tensor<1x80x4096xf32>) outs(%cst_1149 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1150 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %903 = tensor.empty() : tensor<1x80x1xf32>
+    %904 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%902, %cst_1150 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%903 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %905 = tensor.empty() : tensor<1x80x1xf32>
+    %906 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%904 : tensor<1x80x1xf32>) outs(%905 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %907 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1151 = tensor.collapse_shape %906 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %908 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899, %collapsed_1151 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%907 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1152 = tensor.expand_shape %extracted_slice_14 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %909 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1153 = tensor.collapse_shape %expanded_1152 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %910 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1153, %908 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%909 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %911 = tensor.empty() : tensor<4096x11008xf32>
+    %912 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_172 : tensor<11008x4096xf32>) outs(%911 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1154 = tensor.collapse_shape %910 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1155 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %913 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1154, %912 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1155 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1156 = tensor.expand_shape %913 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %914 = tensor.empty() : tensor<1x80x11008xf32>
+    %915 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1156 : tensor<1x80x11008xf32>) outs(%914 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %916 = tensor.empty() : tensor<4096x11008xf32>
+    %917 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_174 : tensor<11008x4096xf32>) outs(%916 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1157 = tensor.collapse_shape %910 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1158 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %918 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1157, %917 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1158 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1159 = tensor.expand_shape %918 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %919 = tensor.empty() : tensor<1x80x11008xf32>
+    %920 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%915, %expanded_1159 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%919 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %921 = tensor.empty() : tensor<11008x4096xf32>
+    %922 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_176 : tensor<4096x11008xf32>) outs(%921 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1160 = tensor.collapse_shape %920 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1161 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %923 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1160, %922 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1161 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1162 = tensor.expand_shape %923 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %924 = tensor.empty() : tensor<1x80x4096xf32>
+    %925 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899, %expanded_1162 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%924 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %926 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1163 = arith.constant 2.000000e+00 : f32
+    %927 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925 : tensor<1x80x4096xf32>) outs(%926 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1163 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1164 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %928 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%927 : tensor<1x80x4096xf32>) outs(%cst_1164 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1165 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %929 = tensor.empty() : tensor<1x80x1xf32>
+    %930 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%928, %cst_1165 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%929 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %931 = tensor.empty() : tensor<1x80x1xf32>
+    %932 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%930 : tensor<1x80x1xf32>) outs(%931 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %933 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1166 = tensor.collapse_shape %932 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %934 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925, %collapsed_1166 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%933 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1167 = tensor.expand_shape %extracted_slice_15 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %935 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1168 = tensor.collapse_shape %expanded_1167 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %936 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1168, %934 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%935 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %937 = tensor.empty() : tensor<4096x4096xf32>
+    %938 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_178 : tensor<4096x4096xf32>) outs(%937 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1169 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1170 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %939 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1169, %938 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1170 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1171 = tensor.expand_shape %939 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %940 = tensor.empty() : tensor<4096x4096xf32>
+    %941 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_180 : tensor<4096x4096xf32>) outs(%940 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1172 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1173 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %942 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1172, %941 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1173 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1174 = tensor.expand_shape %942 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %943 = tensor.empty() : tensor<4096x4096xf32>
+    %944 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_182 : tensor<4096x4096xf32>) outs(%943 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1175 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1176 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %945 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1175, %944 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1176 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1177 = tensor.expand_shape %945 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1178 = tensor.expand_shape %expanded_1171 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %946 = tensor.empty() : tensor<1x32x80x128xf32>
+    %947 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1178 : tensor<1x80x32x128xf32>) outs(%946 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1179 = tensor.expand_shape %expanded_1174 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %948 = tensor.empty() : tensor<1x32x80x128xf32>
+    %949 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1179 : tensor<1x80x32x128xf32>) outs(%948 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1180 = tensor.expand_shape %expanded_1177 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %950 = tensor.empty() : tensor<1x32x80x128xf32>
+    %951 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1180 : tensor<1x80x32x128xf32>) outs(%950 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1181 = tensor.extract_slice %expanded_548[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1182 = tensor.extract_slice %expanded_550[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %952 = tensor.empty() : tensor<1x80x128xf32>
+    %953 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1181 : tensor<1x1x80x128xf32>) outs(%952 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %954 = tensor.empty() : tensor<80x128xf32>
+    %955 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%953 : tensor<1x80x128xf32>) outs(%954 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %956 = tensor.empty() : tensor<1x80x128xf32>
+    %957 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1182 : tensor<1x1x80x128xf32>) outs(%956 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %958 = tensor.empty() : tensor<80x128xf32>
+    %959 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%957 : tensor<1x80x128xf32>) outs(%958 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %960 = tensor.empty() : tensor<1x80x128xf32>
+    %961 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%960 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %955[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1183 = tensor.expand_shape %961 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %962 = tensor.empty() : tensor<1x80x128xf32>
+    %963 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%962 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %959[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1184 = tensor.expand_shape %963 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %964 = tensor.empty() : tensor<1x32x80x128xf32>
+    %965 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%947, %961 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%964 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1185 = tensor.extract_slice %947[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1186 = tensor.extract_slice %947[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %966 = tensor.empty() : tensor<1x32x80x64xf32>
+    %967 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1186 : tensor<1x32x80x64xf32>) outs(%966 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %968 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1187 = tensor.insert_slice %967 into %968[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1188 = tensor.insert_slice %extracted_slice_1185 into %inserted_slice_1187[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %969 = tensor.empty() : tensor<1x32x80x128xf32>
+    %970 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1188, %963 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%969 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %971 = tensor.empty() : tensor<1x32x80x128xf32>
+    %972 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%965, %970 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%971 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %973 = tensor.empty() : tensor<1x32x80x128xf32>
+    %974 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%949, %961 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%973 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1189 = tensor.extract_slice %949[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1190 = tensor.extract_slice %949[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %975 = tensor.empty() : tensor<1x32x80x64xf32>
+    %976 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1190 : tensor<1x32x80x64xf32>) outs(%975 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %977 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1191 = tensor.insert_slice %976 into %977[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1192 = tensor.insert_slice %extracted_slice_1189 into %inserted_slice_1191[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %978 = tensor.empty() : tensor<1x32x80x128xf32>
+    %979 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1192, %963 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%978 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %980 = tensor.empty() : tensor<1x32x80x128xf32>
+    %981 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%974, %979 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%980 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %982 = tensor.empty() : tensor<1x32x128x80xf32>
+    %983 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%981 : tensor<1x32x80x128xf32>) outs(%982 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1193 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1194 = tensor.collapse_shape %972 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1195 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1196 = tensor.collapse_shape %983 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1197 = arith.constant 0.000000e+00 : f32
+    %984 = tensor.empty() : tensor<32x80x80xf32>
+    %985 = linalg.fill ins(%cst_1197 : f32) outs(%984 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %986 = linalg.batch_matmul ins(%collapsed_1194, %collapsed_1196 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%985 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1198 = tensor.expand_shape %986 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1199 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %987 = tensor.empty() : tensor<1x32x80x80xf32>
+    %988 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1199 : tensor<1x32x80x80xf32>) outs(%987 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %989 = tensor.empty() : tensor<1x32x80x80xf32>
+    %990 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1198, %988 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%989 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %991 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1200 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %992 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%990, %collapsed_1200 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%991 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %993 = tensor.empty() : tensor<1x32x80x1xf32>
+    %994 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%993 : tensor<1x32x80x1xf32>) { // Seed the max accumulator: every element becomes -inf, the identity for max.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // IEEE-754 single-precision bit pattern of -inf.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %995 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%992 : tensor<1x32x80x80xf32>) outs(%994 : tensor<1x32x80x1xf32>) { // Max-reduce %992 over the last iterator; the init must be the -inf-filled %994, not the uninitialized %993.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %996 = tensor.empty() : tensor<1x32x80x80xf32>
+    %997 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%992, %995 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%996 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %998 = tensor.empty() : tensor<1x32x80x1xf32>
+    %999 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%998 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1000 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%997 : tensor<1x32x80x80xf32>) outs(%999 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1001 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1002 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%997, %1000 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1001 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1201 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1202 = tensor.collapse_shape %1002 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1203 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1204 = tensor.collapse_shape %951 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1205 = arith.constant 0.000000e+00 : f32
+    %1003 = tensor.empty() : tensor<32x80x128xf32>
+    %1004 = linalg.fill ins(%cst_1205 : f32) outs(%1003 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1005 = linalg.batch_matmul ins(%collapsed_1202, %collapsed_1204 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1004 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1206 = tensor.expand_shape %1005 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1006 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1007 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1206 : tensor<1x32x80x128xf32>) outs(%1006 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1207 = tensor.collapse_shape %1007 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1008 = tensor.empty() : tensor<4096x4096xf32>
+    %1009 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_184 : tensor<4096x4096xf32>) outs(%1008 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1208 = tensor.collapse_shape %collapsed_1207 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1209 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1010 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1208, %1009 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1209 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1210 = tensor.expand_shape %1010 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1011 = tensor.empty() : tensor<1x80x4096xf32>
+    %1012 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925, %expanded_1210 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1011 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1013 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1211 = arith.constant 2.000000e+00 : f32
+    %1014 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012 : tensor<1x80x4096xf32>) outs(%1013 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1211 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1212 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1015 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1014 : tensor<1x80x4096xf32>) outs(%cst_1212 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1213 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1016 = tensor.empty() : tensor<1x80x1xf32>
+    %1017 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1015, %cst_1213 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1016 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1018 = tensor.empty() : tensor<1x80x1xf32>
+    %1019 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1017 : tensor<1x80x1xf32>) outs(%1018 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1020 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1214 = tensor.collapse_shape %1019 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1021 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012, %collapsed_1214 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1020 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1215 = tensor.expand_shape %extracted_slice_16 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1022 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1216 = tensor.collapse_shape %expanded_1215 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1023 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1216, %1021 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1022 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1024 = tensor.empty() : tensor<4096x11008xf32>
+    %1025 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_186 : tensor<11008x4096xf32>) outs(%1024 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1217 = tensor.collapse_shape %1023 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1218 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1026 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1217, %1025 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1218 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1219 = tensor.expand_shape %1026 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1027 = tensor.empty() : tensor<1x80x11008xf32>
+    %1028 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1219 : tensor<1x80x11008xf32>) outs(%1027 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1029 = tensor.empty() : tensor<4096x11008xf32>
+    %1030 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_188 : tensor<11008x4096xf32>) outs(%1029 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1220 = tensor.collapse_shape %1023 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1221 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1031 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1220, %1030 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1221 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1222 = tensor.expand_shape %1031 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1032 = tensor.empty() : tensor<1x80x11008xf32>
+    %1033 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1028, %expanded_1222 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1032 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1034 = tensor.empty() : tensor<11008x4096xf32>
+    %1035 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_190 : tensor<4096x11008xf32>) outs(%1034 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1223 = tensor.collapse_shape %1033 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1224 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1036 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1223, %1035 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1224 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1225 = tensor.expand_shape %1036 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1037 = tensor.empty() : tensor<1x80x4096xf32>
+    %1038 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012, %expanded_1225 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1037 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1039 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1226 = arith.constant 2.000000e+00 : f32
+    %1040 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038 : tensor<1x80x4096xf32>) outs(%1039 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1226 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1227 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1041 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1040 : tensor<1x80x4096xf32>) outs(%cst_1227 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1228 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1042 = tensor.empty() : tensor<1x80x1xf32>
+    %1043 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1041, %cst_1228 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1042 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1044 = tensor.empty() : tensor<1x80x1xf32>
+    %1045 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1043 : tensor<1x80x1xf32>) outs(%1044 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1046 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1229 = tensor.collapse_shape %1045 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1047 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038, %collapsed_1229 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1046 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1230 = tensor.expand_shape %extracted_slice_17 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1048 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1231 = tensor.collapse_shape %expanded_1230 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1049 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1231, %1047 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1048 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1050 = tensor.empty() : tensor<4096x4096xf32>
+    %1051 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_192 : tensor<4096x4096xf32>) outs(%1050 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1232 = tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1233 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1052 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1232, %1051 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1233 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1234 = tensor.expand_shape %1052 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1053 = tensor.empty() : tensor<4096x4096xf32>
+    %1054 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_194 : tensor<4096x4096xf32>) outs(%1053 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1235 = tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1236 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1055 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1235, %1054 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1236 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1237 = tensor.expand_shape %1055 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1056 = tensor.empty() : tensor<4096x4096xf32>
+    %1057 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_196 : tensor<4096x4096xf32>) outs(%1056 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1238 = tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1239 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1058 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1238, %1057 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1239 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1240 = tensor.expand_shape %1058 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1241 = tensor.expand_shape %expanded_1234 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1059 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1060 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1241 : tensor<1x80x32x128xf32>) outs(%1059 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1242 = tensor.expand_shape %expanded_1237 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1061 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1062 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1242 : tensor<1x80x32x128xf32>) outs(%1061 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1243 = tensor.expand_shape %expanded_1240 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1063 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1064 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1243 : tensor<1x80x32x128xf32>) outs(%1063 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1244 = tensor.extract_slice %expanded_552[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1245 = tensor.extract_slice %expanded_554[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1065 = tensor.empty() : tensor<1x80x128xf32>
+    %1066 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1244 : tensor<1x1x80x128xf32>) outs(%1065 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1067 = tensor.empty() : tensor<80x128xf32>
+    %1068 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1066 : tensor<1x80x128xf32>) outs(%1067 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1069 = tensor.empty() : tensor<1x80x128xf32>
+    %1070 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1245 : tensor<1x1x80x128xf32>) outs(%1069 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1071 = tensor.empty() : tensor<80x128xf32>
+    %1072 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1070 : tensor<1x80x128xf32>) outs(%1071 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1073 = tensor.empty() : tensor<1x80x128xf32>
+    %1074 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1073 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1068[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1246 = tensor.expand_shape %1074 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1075 = tensor.empty() : tensor<1x80x128xf32>
+    %1076 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1075 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1072[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1247 = tensor.expand_shape %1076 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1077 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1078 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1060, %1074 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1077 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1248 = tensor.extract_slice %1060[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1249 = tensor.extract_slice %1060[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1079 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1080 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1249 : tensor<1x32x80x64xf32>) outs(%1079 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1081 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1250 = tensor.insert_slice %1080 into %1081[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1251 = tensor.insert_slice %extracted_slice_1248 into %inserted_slice_1250[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1082 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1083 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1251, %1076 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1082 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1084 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1085 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1078, %1083 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1084 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1086 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1087 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1062, %1074 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1086 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1252 = tensor.extract_slice %1062[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1253 = tensor.extract_slice %1062[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1088 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1089 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1253 : tensor<1x32x80x64xf32>) outs(%1088 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1090 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1254 = tensor.insert_slice %1089 into %1090[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1255 = tensor.insert_slice %extracted_slice_1252 into %inserted_slice_1254[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1091 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1092 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1255, %1076 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1091 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1093 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1094 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1087, %1092 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1093 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1095 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1096 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1094 : tensor<1x32x80x128xf32>) outs(%1095 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1256 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1257 = tensor.collapse_shape %1085 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1258 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1259 = tensor.collapse_shape %1096 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1260 = arith.constant 0.000000e+00 : f32
+    %1097 = tensor.empty() : tensor<32x80x80xf32>
+    %1098 = linalg.fill ins(%cst_1260 : f32) outs(%1097 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1099 = linalg.batch_matmul ins(%collapsed_1257, %collapsed_1259 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1098 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1261 = tensor.expand_shape %1099 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1262 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1100 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1101 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1262 : tensor<1x32x80x80xf32>) outs(%1100 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1102 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1103 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1261, %1101 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1102 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1104 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1263 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1105 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1103, %collapsed_1263 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1104 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1106 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1107 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1106 : tensor<1x32x80x1xf32>) { // Fill the max accumulator with -inf.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // 0xFF800000 is the f32 bit pattern of -inf.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1108 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1105 : tensor<1x32x80x80xf32>) outs(%1107 : tensor<1x32x80x1xf32>) { // Max-reduce the last dim; seeded from the -inf fill (%1107), since the raw tensor.empty (%1106) has undefined contents.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1109 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1110 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1105, %1108 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1109 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1111 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1112 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1111 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1113 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1110 : tensor<1x32x80x80xf32>) outs(%1112 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1114 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1115 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1110, %1113 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1114 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1264 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1265 = tensor.collapse_shape %1115 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1266 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1267 = tensor.collapse_shape %1064 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1268 = arith.constant 0.000000e+00 : f32
+    %1116 = tensor.empty() : tensor<32x80x128xf32>
+    %1117 = linalg.fill ins(%cst_1268 : f32) outs(%1116 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1118 = linalg.batch_matmul ins(%collapsed_1265, %collapsed_1267 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1117 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1269 = tensor.expand_shape %1118 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1119 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1120 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1269 : tensor<1x32x80x128xf32>) outs(%1119 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1270 = tensor.collapse_shape %1120 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1121 = tensor.empty() : tensor<4096x4096xf32>
+    %1122 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_198 : tensor<4096x4096xf32>) outs(%1121 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1271 = tensor.collapse_shape %collapsed_1270 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1272 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1123 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1271, %1122 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1272 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1273 = tensor.expand_shape %1123 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1124 = tensor.empty() : tensor<1x80x4096xf32>
+    %1125 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038, %expanded_1273 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1124 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1126 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1274 = arith.constant 2.000000e+00 : f32
+    %1127 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125 : tensor<1x80x4096xf32>) outs(%1126 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1274 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1275 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1128 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1127 : tensor<1x80x4096xf32>) outs(%cst_1275 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1276 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1129 = tensor.empty() : tensor<1x80x1xf32>
+    %1130 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1128, %cst_1276 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1129 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1131 = tensor.empty() : tensor<1x80x1xf32>
+    %1132 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1130 : tensor<1x80x1xf32>) outs(%1131 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1133 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1277 = tensor.collapse_shape %1132 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1134 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125, %collapsed_1277 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1133 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1278 = tensor.expand_shape %extracted_slice_18 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1135 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1279 = tensor.collapse_shape %expanded_1278 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1136 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1279, %1134 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1135 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1137 = tensor.empty() : tensor<4096x11008xf32>
+    %1138 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_200 : tensor<11008x4096xf32>) outs(%1137 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1280 = tensor.collapse_shape %1136 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1281 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1139 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1280, %1138 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1281 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1282 = tensor.expand_shape %1139 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1140 = tensor.empty() : tensor<1x80x11008xf32>
+    %1141 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1282 : tensor<1x80x11008xf32>) outs(%1140 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1142 = tensor.empty() : tensor<4096x11008xf32>
+    %1143 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_202 : tensor<11008x4096xf32>) outs(%1142 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1283 = tensor.collapse_shape %1136 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1284 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1144 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1283, %1143 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1284 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1285 = tensor.expand_shape %1144 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1145 = tensor.empty() : tensor<1x80x11008xf32>
+    %1146 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1141, %expanded_1285 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1145 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1147 = tensor.empty() : tensor<11008x4096xf32>
+    %1148 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_204 : tensor<4096x11008xf32>) outs(%1147 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1286 = tensor.collapse_shape %1146 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1287 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1149 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1286, %1148 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1287 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1288 = tensor.expand_shape %1149 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1150 = tensor.empty() : tensor<1x80x4096xf32>
+    %1151 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125, %expanded_1288 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1150 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1152 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1289 = arith.constant 2.000000e+00 : f32
+    %1153 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151 : tensor<1x80x4096xf32>) outs(%1152 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1289 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1290 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1154 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1153 : tensor<1x80x4096xf32>) outs(%cst_1290 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1291 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1155 = tensor.empty() : tensor<1x80x1xf32>
+    %1156 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1154, %cst_1291 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1155 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1157 = tensor.empty() : tensor<1x80x1xf32>
+    %1158 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1156 : tensor<1x80x1xf32>) outs(%1157 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1159 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1292 = tensor.collapse_shape %1158 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1160 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151, %collapsed_1292 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1159 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1293 = tensor.expand_shape %extracted_slice_19 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1161 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1294 = tensor.collapse_shape %expanded_1293 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1162 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1294, %1160 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1161 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1163 = tensor.empty() : tensor<4096x4096xf32>
+    %1164 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_206 : tensor<4096x4096xf32>) outs(%1163 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1295 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1296 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1165 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1295, %1164 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1296 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1297 = tensor.expand_shape %1165 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1166 = tensor.empty() : tensor<4096x4096xf32>
+    %1167 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_208 : tensor<4096x4096xf32>) outs(%1166 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1298 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1299 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1168 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1298, %1167 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1299 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1300 = tensor.expand_shape %1168 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1169 = tensor.empty() : tensor<4096x4096xf32>
+    %1170 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_210 : tensor<4096x4096xf32>) outs(%1169 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1301 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1302 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1171 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1301, %1170 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1302 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1303 = tensor.expand_shape %1171 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1304 = tensor.expand_shape %expanded_1297 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1172 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1173 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1304 : tensor<1x80x32x128xf32>) outs(%1172 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1305 = tensor.expand_shape %expanded_1300 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1174 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1175 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1305 : tensor<1x80x32x128xf32>) outs(%1174 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1306 = tensor.expand_shape %expanded_1303 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1176 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1177 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1306 : tensor<1x80x32x128xf32>) outs(%1176 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1307 = tensor.extract_slice %expanded_556[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1308 = tensor.extract_slice %expanded_558[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1178 = tensor.empty() : tensor<1x80x128xf32>
+    %1179 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1307 : tensor<1x1x80x128xf32>) outs(%1178 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1180 = tensor.empty() : tensor<80x128xf32>
+    %1181 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1179 : tensor<1x80x128xf32>) outs(%1180 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1182 = tensor.empty() : tensor<1x80x128xf32>
+    %1183 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1308 : tensor<1x1x80x128xf32>) outs(%1182 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1184 = tensor.empty() : tensor<80x128xf32>
+    %1185 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1183 : tensor<1x80x128xf32>) outs(%1184 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1186 = tensor.empty() : tensor<1x80x128xf32>
+    %1187 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1186 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1181[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1309 = tensor.expand_shape %1187 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1188 = tensor.empty() : tensor<1x80x128xf32>
+    %1189 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1188 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1185[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1310 = tensor.expand_shape %1189 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1190 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1191 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1173, %1187 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1190 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1311 = tensor.extract_slice %1173[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1312 = tensor.extract_slice %1173[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1192 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1193 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1312 : tensor<1x32x80x64xf32>) outs(%1192 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1194 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1313 = tensor.insert_slice %1193 into %1194[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1314 = tensor.insert_slice %extracted_slice_1311 into %inserted_slice_1313[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1195 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1196 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1314, %1189 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1195 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1197 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1198 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1191, %1196 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1197 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1199 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1200 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1175, %1187 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1199 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1315 = tensor.extract_slice %1175[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1316 = tensor.extract_slice %1175[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1201 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1202 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1316 : tensor<1x32x80x64xf32>) outs(%1201 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1203 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1317 = tensor.insert_slice %1202 into %1203[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1318 = tensor.insert_slice %extracted_slice_1315 into %inserted_slice_1317[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1204 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1205 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1318, %1189 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1204 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1206 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1207 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1200, %1205 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1206 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1208 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1209 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1207 : tensor<1x32x80x128xf32>) outs(%1208 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1319 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1320 = tensor.collapse_shape %1198 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1321 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1322 = tensor.collapse_shape %1209 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1323 = arith.constant 0.000000e+00 : f32
+    %1210 = tensor.empty() : tensor<32x80x80xf32>
+    %1211 = linalg.fill ins(%cst_1323 : f32) outs(%1210 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1212 = linalg.batch_matmul ins(%collapsed_1320, %collapsed_1322 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1211 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1324 = tensor.expand_shape %1212 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1325 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1213 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1214 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1325 : tensor<1x32x80x80xf32>) outs(%1213 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1215 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1216 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1324, %1214 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1215 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1217 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1326 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1218 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1216, %collapsed_1326 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1217 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1219 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1220 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1219 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1221 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1218 : tensor<1x32x80x80xf32>) outs(%1220 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):  // %out is the running maximum; it must start from the -inf (0xFF800000) fill in %1220, not the uninitialized tensor.empty %1219
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1222 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1223 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1218, %1221 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1222 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1224 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1225 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1224 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1226 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1223 : tensor<1x32x80x80xf32>) outs(%1225 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1227 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1228 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1223, %1226 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1227 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1327 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1328 = tensor.collapse_shape %1228 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1329 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1330 = tensor.collapse_shape %1177 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1331 = arith.constant 0.000000e+00 : f32
+    %1229 = tensor.empty() : tensor<32x80x128xf32>
+    %1230 = linalg.fill ins(%cst_1331 : f32) outs(%1229 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1231 = linalg.batch_matmul ins(%collapsed_1328, %collapsed_1330 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1230 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1332 = tensor.expand_shape %1231 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1232 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1233 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1332 : tensor<1x32x80x128xf32>) outs(%1232 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1333 = tensor.collapse_shape %1233 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1234 = tensor.empty() : tensor<4096x4096xf32>
+    %1235 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_212 : tensor<4096x4096xf32>) outs(%1234 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1334 = tensor.collapse_shape %collapsed_1333 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1335 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1236 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1334, %1235 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1335 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1336 = tensor.expand_shape %1236 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1237 = tensor.empty() : tensor<1x80x4096xf32>
+    %1238 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151, %expanded_1336 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1237 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1239 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1337 = arith.constant 2.000000e+00 : f32
+    %1240 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238 : tensor<1x80x4096xf32>) outs(%1239 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1337 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1338 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1241 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1240 : tensor<1x80x4096xf32>) outs(%cst_1338 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1339 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1242 = tensor.empty() : tensor<1x80x1xf32>
+    %1243 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1241, %cst_1339 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1242 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1244 = tensor.empty() : tensor<1x80x1xf32>
+    %1245 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1243 : tensor<1x80x1xf32>) outs(%1244 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1246 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1340 = tensor.collapse_shape %1245 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1247 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238, %collapsed_1340 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1246 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1341 = tensor.expand_shape %extracted_slice_20 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1248 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1342 = tensor.collapse_shape %expanded_1341 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1249 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1342, %1247 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1248 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1250 = tensor.empty() : tensor<4096x11008xf32>
+    %1251 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_214 : tensor<11008x4096xf32>) outs(%1250 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1343 = tensor.collapse_shape %1249 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1344 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1252 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1343, %1251 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1344 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1345 = tensor.expand_shape %1252 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1253 = tensor.empty() : tensor<1x80x11008xf32>
+    %1254 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1345 : tensor<1x80x11008xf32>) outs(%1253 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1255 = tensor.empty() : tensor<4096x11008xf32>
+    %1256 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_216 : tensor<11008x4096xf32>) outs(%1255 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1346 = tensor.collapse_shape %1249 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1347 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1257 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1346, %1256 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1347 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1348 = tensor.expand_shape %1257 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1258 = tensor.empty() : tensor<1x80x11008xf32>
+    %1259 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1254, %expanded_1348 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1258 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1260 = tensor.empty() : tensor<11008x4096xf32>
+    %1261 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_218 : tensor<4096x11008xf32>) outs(%1260 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1349 = tensor.collapse_shape %1259 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1350 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1262 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1349, %1261 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1350 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1351 = tensor.expand_shape %1262 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1263 = tensor.empty() : tensor<1x80x4096xf32>
+    %1264 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238, %expanded_1351 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1263 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1265 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1352 = arith.constant 2.000000e+00 : f32
+    %1266 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264 : tensor<1x80x4096xf32>) outs(%1265 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1352 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1353 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1267 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1266 : tensor<1x80x4096xf32>) outs(%cst_1353 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1354 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1268 = tensor.empty() : tensor<1x80x1xf32>
+    %1269 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1267, %cst_1354 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1268 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1270 = tensor.empty() : tensor<1x80x1xf32>
+    %1271 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1269 : tensor<1x80x1xf32>) outs(%1270 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1272 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1355 = tensor.collapse_shape %1271 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1273 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264, %collapsed_1355 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1272 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1356 = tensor.expand_shape %extracted_slice_21 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1274 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1357 = tensor.collapse_shape %expanded_1356 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1275 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1357, %1273 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1274 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1276 = tensor.empty() : tensor<4096x4096xf32>
+    %1277 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_220 : tensor<4096x4096xf32>) outs(%1276 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1358 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1359 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1278 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1358, %1277 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1359 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1360 = tensor.expand_shape %1278 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1279 = tensor.empty() : tensor<4096x4096xf32>
+    %1280 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_222 : tensor<4096x4096xf32>) outs(%1279 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1361 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1362 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1281 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1361, %1280 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1362 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1363 = tensor.expand_shape %1281 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1282 = tensor.empty() : tensor<4096x4096xf32>
+    %1283 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_224 : tensor<4096x4096xf32>) outs(%1282 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1364 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1365 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1284 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1364, %1283 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1365 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1366 = tensor.expand_shape %1284 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1367 = tensor.expand_shape %expanded_1360 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1285 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1286 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1367 : tensor<1x80x32x128xf32>) outs(%1285 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1368 = tensor.expand_shape %expanded_1363 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1287 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1288 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1368 : tensor<1x80x32x128xf32>) outs(%1287 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1369 = tensor.expand_shape %expanded_1366 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1289 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1290 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1369 : tensor<1x80x32x128xf32>) outs(%1289 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1370 = tensor.extract_slice %expanded_560[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1371 = tensor.extract_slice %expanded_562[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1291 = tensor.empty() : tensor<1x80x128xf32>
+    %1292 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1370 : tensor<1x1x80x128xf32>) outs(%1291 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1293 = tensor.empty() : tensor<80x128xf32>
+    %1294 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1292 : tensor<1x80x128xf32>) outs(%1293 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1295 = tensor.empty() : tensor<1x80x128xf32>
+    %1296 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1371 : tensor<1x1x80x128xf32>) outs(%1295 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1297 = tensor.empty() : tensor<80x128xf32>
+    %1298 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1296 : tensor<1x80x128xf32>) outs(%1297 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1299 = tensor.empty() : tensor<1x80x128xf32>
+    %1300 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1299 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1294[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1372 = tensor.expand_shape %1300 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1301 = tensor.empty() : tensor<1x80x128xf32>
+    %1302 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1301 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1298[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1373 = tensor.expand_shape %1302 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1303 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1304 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1286, %1300 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1303 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1374 = tensor.extract_slice %1286[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1375 = tensor.extract_slice %1286[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1305 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1306 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1375 : tensor<1x32x80x64xf32>) outs(%1305 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1307 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1376 = tensor.insert_slice %1306 into %1307[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1377 = tensor.insert_slice %extracted_slice_1374 into %inserted_slice_1376[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1308 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1309 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1377, %1302 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1308 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1310 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1311 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1304, %1309 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1310 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1312 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1313 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1288, %1300 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1312 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1378 = tensor.extract_slice %1288[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1379 = tensor.extract_slice %1288[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1314 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1315 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1379 : tensor<1x32x80x64xf32>) outs(%1314 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1316 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1380 = tensor.insert_slice %1315 into %1316[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1381 = tensor.insert_slice %extracted_slice_1378 into %inserted_slice_1380[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1317 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1318 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1381, %1302 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1317 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1319 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1320 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1313, %1318 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1319 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1321 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1322 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1320 : tensor<1x32x80x128xf32>) outs(%1321 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1382 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1383 = tensor.collapse_shape %1311 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1384 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1385 = tensor.collapse_shape %1322 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1386 = arith.constant 0.000000e+00 : f32
+    %1323 = tensor.empty() : tensor<32x80x80xf32>
+    %1324 = linalg.fill ins(%cst_1386 : f32) outs(%1323 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1325 = linalg.batch_matmul ins(%collapsed_1383, %collapsed_1385 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1324 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1387 = tensor.expand_shape %1325 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1388 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1326 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1327 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1388 : tensor<1x32x80x80xf32>) outs(%1326 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1328 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1329 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1387, %1327 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1328 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1330 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1389 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1331 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1329, %collapsed_1389 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1330 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1332 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1333 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1332 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1334 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1331 : tensor<1x32x80x80xf32>) outs(%1333 : tensor<1x32x80x1xf32>) { // init from the 0xFF800000 (-inf) fill %1333, not the raw tensor.empty %1332
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1335 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1336 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1331, %1334 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1335 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1337 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1338 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1337 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1339 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1336 : tensor<1x32x80x80xf32>) outs(%1338 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1340 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1341 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1336, %1339 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1340 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1390 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1391 = tensor.collapse_shape %1341 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1392 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1393 = tensor.collapse_shape %1290 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1394 = arith.constant 0.000000e+00 : f32
+    %1342 = tensor.empty() : tensor<32x80x128xf32>
+    %1343 = linalg.fill ins(%cst_1394 : f32) outs(%1342 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1344 = linalg.batch_matmul ins(%collapsed_1391, %collapsed_1393 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1343 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1395 = tensor.expand_shape %1344 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1345 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1346 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1395 : tensor<1x32x80x128xf32>) outs(%1345 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1396 = tensor.collapse_shape %1346 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1347 = tensor.empty() : tensor<4096x4096xf32>
+    %1348 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_226 : tensor<4096x4096xf32>) outs(%1347 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1397 = tensor.collapse_shape %collapsed_1396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1398 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1349 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1397, %1348 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1398 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1399 = tensor.expand_shape %1349 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1350 = tensor.empty() : tensor<1x80x4096xf32>
+    %1351 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264, %expanded_1399 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1350 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1352 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1400 = arith.constant 2.000000e+00 : f32
+    %1353 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351 : tensor<1x80x4096xf32>) outs(%1352 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1400 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1401 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1354 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1353 : tensor<1x80x4096xf32>) outs(%cst_1401 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1402 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1355 = tensor.empty() : tensor<1x80x1xf32>
+    %1356 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1354, %cst_1402 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1355 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1357 = tensor.empty() : tensor<1x80x1xf32>
+    %1358 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1356 : tensor<1x80x1xf32>) outs(%1357 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1359 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1403 = tensor.collapse_shape %1358 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1360 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351, %collapsed_1403 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1359 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1404 = tensor.expand_shape %extracted_slice_22 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1361 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1405 = tensor.collapse_shape %expanded_1404 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1362 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1405, %1360 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1361 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1363 = tensor.empty() : tensor<4096x11008xf32>
+    %1364 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_228 : tensor<11008x4096xf32>) outs(%1363 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1406 = tensor.collapse_shape %1362 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1407 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1365 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1406, %1364 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1407 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1408 = tensor.expand_shape %1365 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1366 = tensor.empty() : tensor<1x80x11008xf32>
+    %1367 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1408 : tensor<1x80x11008xf32>) outs(%1366 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1368 = tensor.empty() : tensor<4096x11008xf32>
+    %1369 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_230 : tensor<11008x4096xf32>) outs(%1368 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1409 = tensor.collapse_shape %1362 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1410 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1370 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1409, %1369 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1410 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1411 = tensor.expand_shape %1370 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1371 = tensor.empty() : tensor<1x80x11008xf32>
+    %1372 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1367, %expanded_1411 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1371 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1373 = tensor.empty() : tensor<11008x4096xf32>
+    %1374 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_232 : tensor<4096x11008xf32>) outs(%1373 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1412 = tensor.collapse_shape %1372 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1413 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1375 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1412, %1374 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1413 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1414 = tensor.expand_shape %1375 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1376 = tensor.empty() : tensor<1x80x4096xf32>
+    %1377 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351, %expanded_1414 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1376 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1378 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1415 = arith.constant 2.000000e+00 : f32
+    %1379 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377 : tensor<1x80x4096xf32>) outs(%1378 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1415 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1416 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1380 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1379 : tensor<1x80x4096xf32>) outs(%cst_1416 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1417 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1381 = tensor.empty() : tensor<1x80x1xf32>
+    %1382 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1380, %cst_1417 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1381 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1383 = tensor.empty() : tensor<1x80x1xf32>
+    %1384 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1382 : tensor<1x80x1xf32>) outs(%1383 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1385 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1418 = tensor.collapse_shape %1384 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1386 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377, %collapsed_1418 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1385 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1419 = tensor.expand_shape %extracted_slice_23 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1387 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1420 = tensor.collapse_shape %expanded_1419 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1388 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1420, %1386 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1387 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1389 = tensor.empty() : tensor<4096x4096xf32>
+    %1390 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_234 : tensor<4096x4096xf32>) outs(%1389 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1421 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1422 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1391 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1421, %1390 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1422 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1423 = tensor.expand_shape %1391 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1392 = tensor.empty() : tensor<4096x4096xf32>
+    %1393 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_236 : tensor<4096x4096xf32>) outs(%1392 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1424 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1425 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1394 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1424, %1393 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1425 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1426 = tensor.expand_shape %1394 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1395 = tensor.empty() : tensor<4096x4096xf32>
+    %1396 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_238 : tensor<4096x4096xf32>) outs(%1395 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1427 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1428 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1397 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1427, %1396 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1428 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1429 = tensor.expand_shape %1397 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1430 = tensor.expand_shape %expanded_1423 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1398 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1399 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1430 : tensor<1x80x32x128xf32>) outs(%1398 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1431 = tensor.expand_shape %expanded_1426 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1400 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1401 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1431 : tensor<1x80x32x128xf32>) outs(%1400 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1432 = tensor.expand_shape %expanded_1429 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1402 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1403 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1432 : tensor<1x80x32x128xf32>) outs(%1402 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1433 = tensor.extract_slice %expanded_564[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1434 = tensor.extract_slice %expanded_566[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1404 = tensor.empty() : tensor<1x80x128xf32>
+    %1405 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1433 : tensor<1x1x80x128xf32>) outs(%1404 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1406 = tensor.empty() : tensor<80x128xf32>
+    %1407 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1405 : tensor<1x80x128xf32>) outs(%1406 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1408 = tensor.empty() : tensor<1x80x128xf32>
+    %1409 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1434 : tensor<1x1x80x128xf32>) outs(%1408 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1410 = tensor.empty() : tensor<80x128xf32>
+    %1411 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1409 : tensor<1x80x128xf32>) outs(%1410 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1412 = tensor.empty() : tensor<1x80x128xf32>
+    %1413 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1412 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1407[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1435 = tensor.expand_shape %1413 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1414 = tensor.empty() : tensor<1x80x128xf32>
+    %1415 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1414 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1411[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1436 = tensor.expand_shape %1415 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1416 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1417 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1399, %1413 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1416 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1437 = tensor.extract_slice %1399[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1438 = tensor.extract_slice %1399[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1418 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1419 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1438 : tensor<1x32x80x64xf32>) outs(%1418 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1420 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1439 = tensor.insert_slice %1419 into %1420[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1440 = tensor.insert_slice %extracted_slice_1437 into %inserted_slice_1439[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1421 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1422 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1440, %1415 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1421 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1423 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1424 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1417, %1422 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1423 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1425 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1426 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1401, %1413 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1425 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1441 = tensor.extract_slice %1401[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1442 = tensor.extract_slice %1401[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1427 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1428 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1442 : tensor<1x32x80x64xf32>) outs(%1427 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1429 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1443 = tensor.insert_slice %1428 into %1429[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1444 = tensor.insert_slice %extracted_slice_1441 into %inserted_slice_1443[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1430 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1431 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1444, %1415 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1430 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1432 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1433 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1426, %1431 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1432 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1434 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1435 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1433 : tensor<1x32x80x128xf32>) outs(%1434 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1445 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1446 = tensor.collapse_shape %1424 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1447 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1448 = tensor.collapse_shape %1435 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1449 = arith.constant 0.000000e+00 : f32
+    %1436 = tensor.empty() : tensor<32x80x80xf32>
+    %1437 = linalg.fill ins(%cst_1449 : f32) outs(%1436 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1438 = linalg.batch_matmul ins(%collapsed_1446, %collapsed_1448 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1437 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1450 = tensor.expand_shape %1438 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1451 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1439 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1440 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1451 : tensor<1x32x80x80xf32>) outs(%1439 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1441 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1442 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1450, %1440 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1441 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1443 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1452 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1444 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1442, %collapsed_1452 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1443 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1445 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1446 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1445 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // -inf bit pattern: identity element for the max reduction below
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1447 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1444 : tensor<1x32x80x80xf32>) outs(%1446 : tensor<1x32x80x1xf32>) { // accumulate into the -inf-filled tensor, not the uninitialized tensor.empty
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1448 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1449 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1444, %1447 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1448 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1450 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1451 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1450 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1452 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1449 : tensor<1x32x80x80xf32>) outs(%1451 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1453 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1454 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1449, %1452 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1453 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1453 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1454 = tensor.collapse_shape %1454 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1455 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1456 = tensor.collapse_shape %1403 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1457 = arith.constant 0.000000e+00 : f32
+    %1455 = tensor.empty() : tensor<32x80x128xf32>
+    %1456 = linalg.fill ins(%cst_1457 : f32) outs(%1455 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1457 = linalg.batch_matmul ins(%collapsed_1454, %collapsed_1456 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1456 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1458 = tensor.expand_shape %1457 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1458 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1459 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1458 : tensor<1x32x80x128xf32>) outs(%1458 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1459 = tensor.collapse_shape %1459 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1460 = tensor.empty() : tensor<4096x4096xf32>
+    %1461 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_240 : tensor<4096x4096xf32>) outs(%1460 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1460 = tensor.collapse_shape %collapsed_1459 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1461 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1462 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1460, %1461 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1461 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1462 = tensor.expand_shape %1462 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1463 = tensor.empty() : tensor<1x80x4096xf32>
+    %1464 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377, %expanded_1462 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1463 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1465 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1463 = arith.constant 2.000000e+00 : f32
+    %1466 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464 : tensor<1x80x4096xf32>) outs(%1465 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1463 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1464 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1467 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1466 : tensor<1x80x4096xf32>) outs(%cst_1464 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1465 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1468 = tensor.empty() : tensor<1x80x1xf32>
+    %1469 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1467, %cst_1465 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1468 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1470 = tensor.empty() : tensor<1x80x1xf32>
+    %1471 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1469 : tensor<1x80x1xf32>) outs(%1470 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1472 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1466 = tensor.collapse_shape %1471 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1473 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464, %collapsed_1466 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1472 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1467 = tensor.expand_shape %extracted_slice_24 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1474 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1468 = tensor.collapse_shape %expanded_1467 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1475 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1468, %1473 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1474 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1476 = tensor.empty() : tensor<4096x11008xf32>
+    %1477 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_242 : tensor<11008x4096xf32>) outs(%1476 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1469 = tensor.collapse_shape %1475 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1470 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1478 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1469, %1477 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1470 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1471 = tensor.expand_shape %1478 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1479 = tensor.empty() : tensor<1x80x11008xf32>
+    %1480 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1471 : tensor<1x80x11008xf32>) outs(%1479 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1481 = tensor.empty() : tensor<4096x11008xf32>
+    %1482 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_244 : tensor<11008x4096xf32>) outs(%1481 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1472 = tensor.collapse_shape %1475 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1473 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1483 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1472, %1482 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1473 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1474 = tensor.expand_shape %1483 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1484 = tensor.empty() : tensor<1x80x11008xf32>
+    %1485 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1480, %expanded_1474 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1484 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1486 = tensor.empty() : tensor<11008x4096xf32>
+    %1487 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_246 : tensor<4096x11008xf32>) outs(%1486 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1475 = tensor.collapse_shape %1485 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1476 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1488 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1475, %1487 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1476 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1477 = tensor.expand_shape %1488 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1489 = tensor.empty() : tensor<1x80x4096xf32>
+    %1490 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464, %expanded_1477 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1489 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1491 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1478 = arith.constant 2.000000e+00 : f32
+    %1492 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490 : tensor<1x80x4096xf32>) outs(%1491 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1478 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1479 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1493 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1492 : tensor<1x80x4096xf32>) outs(%cst_1479 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1480 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1494 = tensor.empty() : tensor<1x80x1xf32>
+    %1495 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1493, %cst_1480 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1494 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1496 = tensor.empty() : tensor<1x80x1xf32>
+    %1497 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1495 : tensor<1x80x1xf32>) outs(%1496 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1498 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1481 = tensor.collapse_shape %1497 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1499 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490, %collapsed_1481 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1498 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1482 = tensor.expand_shape %extracted_slice_25 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1500 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1483 = tensor.collapse_shape %expanded_1482 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1501 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1483, %1499 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1500 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1502 = tensor.empty() : tensor<4096x4096xf32>
+    %1503 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_248 : tensor<4096x4096xf32>) outs(%1502 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1484 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1485 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1504 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1484, %1503 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1485 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1486 = tensor.expand_shape %1504 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1505 = tensor.empty() : tensor<4096x4096xf32>
+    %1506 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_250 : tensor<4096x4096xf32>) outs(%1505 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1487 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1488 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1507 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1487, %1506 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1488 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1489 = tensor.expand_shape %1507 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1508 = tensor.empty() : tensor<4096x4096xf32>
+    %1509 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_252 : tensor<4096x4096xf32>) outs(%1508 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1490 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1491 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1510 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1490, %1509 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1491 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1492 = tensor.expand_shape %1510 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1493 = tensor.expand_shape %expanded_1486 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1511 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1512 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1493 : tensor<1x80x32x128xf32>) outs(%1511 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1494 = tensor.expand_shape %expanded_1489 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1513 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1514 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1494 : tensor<1x80x32x128xf32>) outs(%1513 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1495 = tensor.expand_shape %expanded_1492 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1515 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1516 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1495 : tensor<1x80x32x128xf32>) outs(%1515 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1496 = tensor.extract_slice %expanded_568[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1497 = tensor.extract_slice %expanded_570[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1517 = tensor.empty() : tensor<1x80x128xf32>
+    %1518 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1496 : tensor<1x1x80x128xf32>) outs(%1517 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1519 = tensor.empty() : tensor<80x128xf32>
+    %1520 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1518 : tensor<1x80x128xf32>) outs(%1519 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1521 = tensor.empty() : tensor<1x80x128xf32>
+    %1522 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1497 : tensor<1x1x80x128xf32>) outs(%1521 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1523 = tensor.empty() : tensor<80x128xf32>
+    %1524 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1522 : tensor<1x80x128xf32>) outs(%1523 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1525 = tensor.empty() : tensor<1x80x128xf32>
+    %1526 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1525 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1520[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1498 = tensor.expand_shape %1526 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1527 = tensor.empty() : tensor<1x80x128xf32>
+    %1528 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1527 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1524[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1499 = tensor.expand_shape %1528 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1529 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1530 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1512, %1526 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1529 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1500 = tensor.extract_slice %1512[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1501 = tensor.extract_slice %1512[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1531 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1532 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1501 : tensor<1x32x80x64xf32>) outs(%1531 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1533 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1502 = tensor.insert_slice %1532 into %1533[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1503 = tensor.insert_slice %extracted_slice_1500 into %inserted_slice_1502[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1534 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1535 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1503, %1528 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1534 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1536 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1537 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1530, %1535 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1536 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1538 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1539 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1514, %1526 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1538 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1504 = tensor.extract_slice %1514[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1505 = tensor.extract_slice %1514[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1540 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1541 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1505 : tensor<1x32x80x64xf32>) outs(%1540 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1542 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1506 = tensor.insert_slice %1541 into %1542[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1507 = tensor.insert_slice %extracted_slice_1504 into %inserted_slice_1506[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1543 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1544 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1507, %1528 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1543 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1545 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1546 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1539, %1544 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1545 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1547 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1548 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1546 : tensor<1x32x80x128xf32>) outs(%1547 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1508 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1509 = tensor.collapse_shape %1537 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1510 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1511 = tensor.collapse_shape %1548 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1512 = arith.constant 0.000000e+00 : f32
+    %1549 = tensor.empty() : tensor<32x80x80xf32>
+    %1550 = linalg.fill ins(%cst_1512 : f32) outs(%1549 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1551 = linalg.batch_matmul ins(%collapsed_1509, %collapsed_1511 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1550 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1513 = tensor.expand_shape %1551 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1514 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1552 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1553 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1514 : tensor<1x32x80x80xf32>) outs(%1552 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1554 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1555 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1513, %1553 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1554 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1556 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1515 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1557 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1555, %collapsed_1515 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1556 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1558 = tensor.empty() : tensor<1x32x80x1xf32>  // uninitialized destination; contents are undefined until filled
+    %1559 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1558 : tensor<1x32x80x1xf32>) {  // fill the accumulator with -inf, the identity element for max
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32  // IEEE-754 bit pattern of -inf
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1560 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1557 : tensor<1x32x80x80xf32>) outs(%1559 : tensor<1x32x80x1xf32>) {  // max over the reduction iterator; init must be the -inf fill (%1559), not the raw tensor.empty (%1558)
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1561 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1562 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1557, %1560 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1561 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1563 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1564 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1563 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1565 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1562 : tensor<1x32x80x80xf32>) outs(%1564 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1566 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1567 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1562, %1565 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1566 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1516 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1517 = tensor.collapse_shape %1567 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1518 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1519 = tensor.collapse_shape %1516 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1520 = arith.constant 0.000000e+00 : f32
+    %1568 = tensor.empty() : tensor<32x80x128xf32>
+    %1569 = linalg.fill ins(%cst_1520 : f32) outs(%1568 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1570 = linalg.batch_matmul ins(%collapsed_1517, %collapsed_1519 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1569 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1521 = tensor.expand_shape %1570 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1571 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1572 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1521 : tensor<1x32x80x128xf32>) outs(%1571 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1522 = tensor.collapse_shape %1572 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1573 = tensor.empty() : tensor<4096x4096xf32>
+    %1574 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_254 : tensor<4096x4096xf32>) outs(%1573 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1523 = tensor.collapse_shape %collapsed_1522 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1524 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1575 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1523, %1574 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1524 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1525 = tensor.expand_shape %1575 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1576 = tensor.empty() : tensor<1x80x4096xf32>
+    %1577 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490, %expanded_1525 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1576 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1578 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1526 = arith.constant 2.000000e+00 : f32
+    %1579 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577 : tensor<1x80x4096xf32>) outs(%1578 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1526 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1527 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1580 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1579 : tensor<1x80x4096xf32>) outs(%cst_1527 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1528 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1581 = tensor.empty() : tensor<1x80x1xf32>
+    %1582 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1580, %cst_1528 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1581 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1583 = tensor.empty() : tensor<1x80x1xf32>
+    %1584 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1582 : tensor<1x80x1xf32>) outs(%1583 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1585 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1529 = tensor.collapse_shape %1584 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1586 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577, %collapsed_1529 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1585 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1530 = tensor.expand_shape %extracted_slice_26 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1587 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1531 = tensor.collapse_shape %expanded_1530 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1588 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1531, %1586 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1587 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1589 = tensor.empty() : tensor<4096x11008xf32>
+    %1590 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_256 : tensor<11008x4096xf32>) outs(%1589 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1532 = tensor.collapse_shape %1588 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1533 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1591 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1532, %1590 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1533 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1534 = tensor.expand_shape %1591 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1592 = tensor.empty() : tensor<1x80x11008xf32>
+    %1593 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1534 : tensor<1x80x11008xf32>) outs(%1592 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1594 = tensor.empty() : tensor<4096x11008xf32>
+    %1595 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_258 : tensor<11008x4096xf32>) outs(%1594 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1535 = tensor.collapse_shape %1588 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1536 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1596 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1535, %1595 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1536 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1537 = tensor.expand_shape %1596 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1597 = tensor.empty() : tensor<1x80x11008xf32>
+    %1598 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1593, %expanded_1537 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1597 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1599 = tensor.empty() : tensor<11008x4096xf32>
+    %1600 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_260 : tensor<4096x11008xf32>) outs(%1599 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1538 = tensor.collapse_shape %1598 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1539 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1601 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1538, %1600 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1539 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1540 = tensor.expand_shape %1601 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1602 = tensor.empty() : tensor<1x80x4096xf32>
+    %1603 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577, %expanded_1540 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1602 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1604 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1541 = arith.constant 2.000000e+00 : f32
+    %1605 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603 : tensor<1x80x4096xf32>) outs(%1604 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1541 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1542 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1606 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1605 : tensor<1x80x4096xf32>) outs(%cst_1542 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1543 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1607 = tensor.empty() : tensor<1x80x1xf32>
+    %1608 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1606, %cst_1543 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1607 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1609 = tensor.empty() : tensor<1x80x1xf32>
+    %1610 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1608 : tensor<1x80x1xf32>) outs(%1609 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1611 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1544 = tensor.collapse_shape %1610 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1612 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603, %collapsed_1544 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1611 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1545 = tensor.expand_shape %extracted_slice_27 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1613 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1546 = tensor.collapse_shape %expanded_1545 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1614 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1546, %1612 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1613 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1615 = tensor.empty() : tensor<4096x4096xf32>
+    %1616 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_262 : tensor<4096x4096xf32>) outs(%1615 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1547 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1548 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1617 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1547, %1616 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1548 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1549 = tensor.expand_shape %1617 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1618 = tensor.empty() : tensor<4096x4096xf32>
+    %1619 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_264 : tensor<4096x4096xf32>) outs(%1618 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1550 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1551 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1620 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1550, %1619 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1551 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1552 = tensor.expand_shape %1620 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1621 = tensor.empty() : tensor<4096x4096xf32>
+    %1622 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_266 : tensor<4096x4096xf32>) outs(%1621 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1553 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1554 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1623 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1553, %1622 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1554 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1555 = tensor.expand_shape %1623 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1556 = tensor.expand_shape %expanded_1549 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1624 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1625 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1556 : tensor<1x80x32x128xf32>) outs(%1624 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1557 = tensor.expand_shape %expanded_1552 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1626 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1627 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1557 : tensor<1x80x32x128xf32>) outs(%1626 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1558 = tensor.expand_shape %expanded_1555 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1628 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1629 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1558 : tensor<1x80x32x128xf32>) outs(%1628 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1559 = tensor.extract_slice %expanded_572[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1560 = tensor.extract_slice %expanded_574[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1630 = tensor.empty() : tensor<1x80x128xf32>
+    %1631 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1559 : tensor<1x1x80x128xf32>) outs(%1630 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1632 = tensor.empty() : tensor<80x128xf32>
+    %1633 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1631 : tensor<1x80x128xf32>) outs(%1632 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1634 = tensor.empty() : tensor<1x80x128xf32>
+    %1635 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1560 : tensor<1x1x80x128xf32>) outs(%1634 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1636 = tensor.empty() : tensor<80x128xf32>
+    %1637 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1635 : tensor<1x80x128xf32>) outs(%1636 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1638 = tensor.empty() : tensor<1x80x128xf32>
+    %1639 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1638 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1633[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1561 = tensor.expand_shape %1639 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1640 = tensor.empty() : tensor<1x80x128xf32>
+    %1641 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1640 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1637[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1562 = tensor.expand_shape %1641 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1642 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1643 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1625, %1639 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1642 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1563 = tensor.extract_slice %1625[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1564 = tensor.extract_slice %1625[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1644 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1645 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1564 : tensor<1x32x80x64xf32>) outs(%1644 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1646 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1565 = tensor.insert_slice %1645 into %1646[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1566 = tensor.insert_slice %extracted_slice_1563 into %inserted_slice_1565[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1647 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1648 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1566, %1641 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1647 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1649 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1650 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1643, %1648 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1649 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1651 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1652 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1627, %1639 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1651 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1567 = tensor.extract_slice %1627[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1568 = tensor.extract_slice %1627[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1653 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1654 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1568 : tensor<1x32x80x64xf32>) outs(%1653 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1655 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1569 = tensor.insert_slice %1654 into %1655[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1570 = tensor.insert_slice %extracted_slice_1567 into %inserted_slice_1569[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1656 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1657 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1570, %1641 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1656 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1658 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1659 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1652, %1657 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1658 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1660 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1661 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1659 : tensor<1x32x80x128xf32>) outs(%1660 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1571 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1572 = tensor.collapse_shape %1650 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1573 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1574 = tensor.collapse_shape %1661 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1575 = arith.constant 0.000000e+00 : f32
+    %1662 = tensor.empty() : tensor<32x80x80xf32>
+    %1663 = linalg.fill ins(%cst_1575 : f32) outs(%1662 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1664 = linalg.batch_matmul ins(%collapsed_1572, %collapsed_1574 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1663 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1576 = tensor.expand_shape %1664 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1577 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1665 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1666 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1577 : tensor<1x32x80x80xf32>) outs(%1665 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1667 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1668 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1576, %1666 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1667 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1669 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1578 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1670 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1668, %collapsed_1578 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1669 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1671 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1672 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1671 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1673 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1670 : tensor<1x32x80x80xf32>) outs(%1672 : tensor<1x32x80x1xf32>) { // max reduction: accumulator must start from the -inf fill %1672, not the uninitialized tensor.empty %1671
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1674 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1675 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1670, %1673 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1674 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1676 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1677 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1676 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1678 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1675 : tensor<1x32x80x80xf32>) outs(%1677 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1679 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1680 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1675, %1678 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1679 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1579 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1580 = tensor.collapse_shape %1680 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1581 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1582 = tensor.collapse_shape %1629 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1583 = arith.constant 0.000000e+00 : f32
+    %1681 = tensor.empty() : tensor<32x80x128xf32>
+    %1682 = linalg.fill ins(%cst_1583 : f32) outs(%1681 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1683 = linalg.batch_matmul ins(%collapsed_1580, %collapsed_1582 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1682 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1584 = tensor.expand_shape %1683 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1684 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1685 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1584 : tensor<1x32x80x128xf32>) outs(%1684 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1585 = tensor.collapse_shape %1685 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1686 = tensor.empty() : tensor<4096x4096xf32>
+    %1687 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_268 : tensor<4096x4096xf32>) outs(%1686 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1586 = tensor.collapse_shape %collapsed_1585 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1587 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1688 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1586, %1687 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1587 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1588 = tensor.expand_shape %1688 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1689 = tensor.empty() : tensor<1x80x4096xf32>
+    %1690 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603, %expanded_1588 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1689 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1691 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1589 = arith.constant 2.000000e+00 : f32
+    %1692 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690 : tensor<1x80x4096xf32>) outs(%1691 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1589 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1590 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1693 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1692 : tensor<1x80x4096xf32>) outs(%cst_1590 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1591 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1694 = tensor.empty() : tensor<1x80x1xf32>
+    %1695 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1693, %cst_1591 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1694 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1696 = tensor.empty() : tensor<1x80x1xf32>
+    %1697 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1695 : tensor<1x80x1xf32>) outs(%1696 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1698 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1592 = tensor.collapse_shape %1697 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1699 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690, %collapsed_1592 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1698 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1593 = tensor.expand_shape %extracted_slice_28 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1700 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1594 = tensor.collapse_shape %expanded_1593 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1701 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1594, %1699 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1700 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1702 = tensor.empty() : tensor<4096x11008xf32>
+    %1703 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_270 : tensor<11008x4096xf32>) outs(%1702 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1595 = tensor.collapse_shape %1701 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1596 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1704 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1595, %1703 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1596 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1597 = tensor.expand_shape %1704 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1705 = tensor.empty() : tensor<1x80x11008xf32>
+    %1706 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1597 : tensor<1x80x11008xf32>) outs(%1705 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1707 = tensor.empty() : tensor<4096x11008xf32>
+    %1708 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_272 : tensor<11008x4096xf32>) outs(%1707 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1598 = tensor.collapse_shape %1701 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1599 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1709 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1598, %1708 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1599 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1600 = tensor.expand_shape %1709 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1710 = tensor.empty() : tensor<1x80x11008xf32>
+    %1711 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1706, %expanded_1600 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1710 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1712 = tensor.empty() : tensor<11008x4096xf32>
+    %1713 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_274 : tensor<4096x11008xf32>) outs(%1712 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1601 = tensor.collapse_shape %1711 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1602 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1714 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1601, %1713 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1602 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1603 = tensor.expand_shape %1714 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1715 = tensor.empty() : tensor<1x80x4096xf32>
+    %1716 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690, %expanded_1603 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1715 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1717 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1604 = arith.constant 2.000000e+00 : f32
+    %1718 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716 : tensor<1x80x4096xf32>) outs(%1717 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1604 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1605 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1719 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1718 : tensor<1x80x4096xf32>) outs(%cst_1605 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1606 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1720 = tensor.empty() : tensor<1x80x1xf32>
+    %1721 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1719, %cst_1606 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1720 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1722 = tensor.empty() : tensor<1x80x1xf32>
+    %1723 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1721 : tensor<1x80x1xf32>) outs(%1722 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1724 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1607 = tensor.collapse_shape %1723 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1725 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716, %collapsed_1607 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1724 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1608 = tensor.expand_shape %extracted_slice_29 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1726 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1609 = tensor.collapse_shape %expanded_1608 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1727 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1609, %1725 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1726 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1728 = tensor.empty() : tensor<4096x4096xf32>
+    %1729 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_276 : tensor<4096x4096xf32>) outs(%1728 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1610 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1611 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1730 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1610, %1729 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1611 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1612 = tensor.expand_shape %1730 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1731 = tensor.empty() : tensor<4096x4096xf32>
+    %1732 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_278 : tensor<4096x4096xf32>) outs(%1731 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1613 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1614 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1733 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1613, %1732 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1614 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1615 = tensor.expand_shape %1733 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1734 = tensor.empty() : tensor<4096x4096xf32>
+    %1735 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_280 : tensor<4096x4096xf32>) outs(%1734 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1616 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1617 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1736 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1616, %1735 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1617 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1618 = tensor.expand_shape %1736 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1619 = tensor.expand_shape %expanded_1612 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1737 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1738 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1619 : tensor<1x80x32x128xf32>) outs(%1737 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1620 = tensor.expand_shape %expanded_1615 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1739 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1740 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1620 : tensor<1x80x32x128xf32>) outs(%1739 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1621 = tensor.expand_shape %expanded_1618 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1741 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1742 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1621 : tensor<1x80x32x128xf32>) outs(%1741 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1622 = tensor.extract_slice %expanded_576[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1623 = tensor.extract_slice %expanded_578[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1743 = tensor.empty() : tensor<1x80x128xf32>
+    %1744 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1622 : tensor<1x1x80x128xf32>) outs(%1743 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1745 = tensor.empty() : tensor<80x128xf32>
+    %1746 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1744 : tensor<1x80x128xf32>) outs(%1745 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1747 = tensor.empty() : tensor<1x80x128xf32>
+    %1748 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1623 : tensor<1x1x80x128xf32>) outs(%1747 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1749 = tensor.empty() : tensor<80x128xf32>
+    %1750 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1748 : tensor<1x80x128xf32>) outs(%1749 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1751 = tensor.empty() : tensor<1x80x128xf32>
+    %1752 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1751 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1746[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1624 = tensor.expand_shape %1752 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1753 = tensor.empty() : tensor<1x80x128xf32>
+    %1754 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1753 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1750[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1625 = tensor.expand_shape %1754 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1755 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1756 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1738, %1752 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1755 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1626 = tensor.extract_slice %1738[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1627 = tensor.extract_slice %1738[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1757 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1758 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1627 : tensor<1x32x80x64xf32>) outs(%1757 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1759 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1628 = tensor.insert_slice %1758 into %1759[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1629 = tensor.insert_slice %extracted_slice_1626 into %inserted_slice_1628[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1760 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1761 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1629, %1754 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1760 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1762 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1763 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1756, %1761 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1762 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1764 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1765 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1740, %1752 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1764 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1630 = tensor.extract_slice %1740[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1631 = tensor.extract_slice %1740[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1766 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1767 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1631 : tensor<1x32x80x64xf32>) outs(%1766 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1768 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1632 = tensor.insert_slice %1767 into %1768[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1633 = tensor.insert_slice %extracted_slice_1630 into %inserted_slice_1632[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1769 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1770 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1633, %1754 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1769 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1771 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1772 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1765, %1770 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1771 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1773 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1774 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1772 : tensor<1x32x80x128xf32>) outs(%1773 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1634 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1635 = tensor.collapse_shape %1763 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1636 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1637 = tensor.collapse_shape %1774 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1638 = arith.constant 0.000000e+00 : f32
+    %1775 = tensor.empty() : tensor<32x80x80xf32>
+    %1776 = linalg.fill ins(%cst_1638 : f32) outs(%1775 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1777 = linalg.batch_matmul ins(%collapsed_1635, %collapsed_1637 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1776 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1639 = tensor.expand_shape %1777 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1640 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1778 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1779 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1640 : tensor<1x32x80x80xf32>) outs(%1778 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1780 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1781 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1639, %1779 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1780 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1782 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1641 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1783 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1781, %collapsed_1641 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1782 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1784 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1785 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1784 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1786 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1783 : tensor<1x32x80x80xf32>) outs(%1785 : tensor<1x32x80x1xf32>) { // seed the row-max reduction with the -inf fill (%1785); %1784 is a tensor.empty whose contents are undefined
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1787 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1788 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1783, %1786 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1787 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1789 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1790 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1789 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1791 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1788 : tensor<1x32x80x80xf32>) outs(%1790 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1792 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1793 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1788, %1791 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1792 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1642 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1643 = tensor.collapse_shape %1793 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1644 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1645 = tensor.collapse_shape %1742 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1646 = arith.constant 0.000000e+00 : f32
+    %1794 = tensor.empty() : tensor<32x80x128xf32>
+    %1795 = linalg.fill ins(%cst_1646 : f32) outs(%1794 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1796 = linalg.batch_matmul ins(%collapsed_1643, %collapsed_1645 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1795 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1647 = tensor.expand_shape %1796 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1797 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1798 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1647 : tensor<1x32x80x128xf32>) outs(%1797 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1648 = tensor.collapse_shape %1798 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1799 = tensor.empty() : tensor<4096x4096xf32>
+    %1800 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_282 : tensor<4096x4096xf32>) outs(%1799 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1649 = tensor.collapse_shape %collapsed_1648 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1650 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1801 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1649, %1800 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1650 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1651 = tensor.expand_shape %1801 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1802 = tensor.empty() : tensor<1x80x4096xf32>
+    %1803 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716, %expanded_1651 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1802 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1804 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1652 = arith.constant 2.000000e+00 : f32
+    %1805 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803 : tensor<1x80x4096xf32>) outs(%1804 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1652 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1653 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1806 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1805 : tensor<1x80x4096xf32>) outs(%cst_1653 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1654 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1807 = tensor.empty() : tensor<1x80x1xf32>
+    %1808 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1806, %cst_1654 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1807 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1809 = tensor.empty() : tensor<1x80x1xf32>
+    %1810 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1808 : tensor<1x80x1xf32>) outs(%1809 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1811 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1655 = tensor.collapse_shape %1810 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1812 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803, %collapsed_1655 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1811 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1656 = tensor.expand_shape %extracted_slice_30 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1813 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1657 = tensor.collapse_shape %expanded_1656 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1814 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1657, %1812 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1813 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1815 = tensor.empty() : tensor<4096x11008xf32>
+    %1816 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_284 : tensor<11008x4096xf32>) outs(%1815 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1658 = tensor.collapse_shape %1814 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1659 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1817 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1658, %1816 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1659 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1660 = tensor.expand_shape %1817 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1818 = tensor.empty() : tensor<1x80x11008xf32>
+    %1819 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1660 : tensor<1x80x11008xf32>) outs(%1818 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1820 = tensor.empty() : tensor<4096x11008xf32>
+    %1821 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_286 : tensor<11008x4096xf32>) outs(%1820 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1661 = tensor.collapse_shape %1814 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1662 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1822 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1661, %1821 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1662 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1663 = tensor.expand_shape %1822 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1823 = tensor.empty() : tensor<1x80x11008xf32>
+    %1824 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1819, %expanded_1663 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1823 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1825 = tensor.empty() : tensor<11008x4096xf32>
+    %1826 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_288 : tensor<4096x11008xf32>) outs(%1825 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1664 = tensor.collapse_shape %1824 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1665 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1827 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1664, %1826 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1665 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1666 = tensor.expand_shape %1827 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1828 = tensor.empty() : tensor<1x80x4096xf32>
+    %1829 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803, %expanded_1666 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1828 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1830 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1667 = arith.constant 2.000000e+00 : f32
+    %1831 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829 : tensor<1x80x4096xf32>) outs(%1830 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1667 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1668 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1832 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1831 : tensor<1x80x4096xf32>) outs(%cst_1668 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1669 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1833 = tensor.empty() : tensor<1x80x1xf32>
+    %1834 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1832, %cst_1669 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1833 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1835 = tensor.empty() : tensor<1x80x1xf32>
+    %1836 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1834 : tensor<1x80x1xf32>) outs(%1835 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1837 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1670 = tensor.collapse_shape %1836 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1838 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829, %collapsed_1670 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1837 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1671 = tensor.expand_shape %extracted_slice_31 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1839 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1672 = tensor.collapse_shape %expanded_1671 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1840 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1672, %1838 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1839 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1841 = tensor.empty() : tensor<4096x4096xf32>
+    %1842 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_290 : tensor<4096x4096xf32>) outs(%1841 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1673 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1674 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1843 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1673, %1842 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1674 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1675 = tensor.expand_shape %1843 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1844 = tensor.empty() : tensor<4096x4096xf32>
+    %1845 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_292 : tensor<4096x4096xf32>) outs(%1844 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1676 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1677 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1846 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1676, %1845 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1677 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1678 = tensor.expand_shape %1846 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1847 = tensor.empty() : tensor<4096x4096xf32>
+    %1848 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_294 : tensor<4096x4096xf32>) outs(%1847 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1679 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1680 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1849 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1679, %1848 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1680 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1681 = tensor.expand_shape %1849 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1682 = tensor.expand_shape %expanded_1675 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1850 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1851 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1682 : tensor<1x80x32x128xf32>) outs(%1850 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1683 = tensor.expand_shape %expanded_1678 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1852 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1853 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1683 : tensor<1x80x32x128xf32>) outs(%1852 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1684 = tensor.expand_shape %expanded_1681 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1854 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1855 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1684 : tensor<1x80x32x128xf32>) outs(%1854 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1685 = tensor.extract_slice %expanded_580[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1686 = tensor.extract_slice %expanded_582[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1856 = tensor.empty() : tensor<1x80x128xf32>
+    %1857 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1685 : tensor<1x1x80x128xf32>) outs(%1856 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1858 = tensor.empty() : tensor<80x128xf32>
+    %1859 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1857 : tensor<1x80x128xf32>) outs(%1858 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1860 = tensor.empty() : tensor<1x80x128xf32>
+    %1861 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1686 : tensor<1x1x80x128xf32>) outs(%1860 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1862 = tensor.empty() : tensor<80x128xf32>
+    %1863 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1861 : tensor<1x80x128xf32>) outs(%1862 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1864 = tensor.empty() : tensor<1x80x128xf32>
+    %1865 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1864 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1859[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1687 = tensor.expand_shape %1865 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1866 = tensor.empty() : tensor<1x80x128xf32>
+    %1867 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1866 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1863[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1688 = tensor.expand_shape %1867 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1868 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1869 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1851, %1865 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1868 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1689 = tensor.extract_slice %1851[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1690 = tensor.extract_slice %1851[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1870 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1871 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1690 : tensor<1x32x80x64xf32>) outs(%1870 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1872 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1691 = tensor.insert_slice %1871 into %1872[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1692 = tensor.insert_slice %extracted_slice_1689 into %inserted_slice_1691[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1873 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1874 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1692, %1867 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1873 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1875 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1876 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1869, %1874 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1875 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1877 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1878 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1853, %1865 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1877 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1693 = tensor.extract_slice %1853[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1694 = tensor.extract_slice %1853[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1879 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1880 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1694 : tensor<1x32x80x64xf32>) outs(%1879 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1881 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1695 = tensor.insert_slice %1880 into %1881[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1696 = tensor.insert_slice %extracted_slice_1693 into %inserted_slice_1695[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1882 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1883 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1696, %1867 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1882 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1884 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1885 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1878, %1883 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1884 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1886 = tensor.empty() : tensor<1x32x128x80xf32>
+    %1887 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1885 : tensor<1x32x80x128xf32>) outs(%1886 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1697 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1698 = tensor.collapse_shape %1876 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1699 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1700 = tensor.collapse_shape %1887 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1701 = arith.constant 0.000000e+00 : f32
+    %1888 = tensor.empty() : tensor<32x80x80xf32>
+    %1889 = linalg.fill ins(%cst_1701 : f32) outs(%1888 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %1890 = linalg.batch_matmul ins(%collapsed_1698, %collapsed_1700 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1889 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1702 = tensor.expand_shape %1890 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1703 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %1891 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1892 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1703 : tensor<1x32x80x80xf32>) outs(%1891 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1893 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1894 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1702, %1892 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1893 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1895 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1704 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %1896 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1894, %collapsed_1704 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1895 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1897 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1898 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1897 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1899 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1896 : tensor<1x32x80x80xf32>) outs(%1897 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1900 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1901 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1896, %1899 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1900 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %1902 = tensor.empty() : tensor<1x32x80x1xf32>
+    %1903 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1902 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1904 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1901 : tensor<1x32x80x80xf32>) outs(%1903 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %1905 = tensor.empty() : tensor<1x32x80x80xf32>
+    %1906 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1901, %1904 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1905 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1705 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1706 = tensor.collapse_shape %1906 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1707 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1708 = tensor.collapse_shape %1855 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1709 = arith.constant 0.000000e+00 : f32
+    %1907 = tensor.empty() : tensor<32x80x128xf32>
+    %1908 = linalg.fill ins(%cst_1709 : f32) outs(%1907 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %1909 = linalg.batch_matmul ins(%collapsed_1706, %collapsed_1708 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1908 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1710 = tensor.expand_shape %1909 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %1910 = tensor.empty() : tensor<1x80x32x128xf32>
+    %1911 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1710 : tensor<1x32x80x128xf32>) outs(%1910 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1711 = tensor.collapse_shape %1911 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %1912 = tensor.empty() : tensor<4096x4096xf32>
+    %1913 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_296 : tensor<4096x4096xf32>) outs(%1912 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1712 = tensor.collapse_shape %collapsed_1711 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1713 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1914 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1712, %1913 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1713 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1714 = tensor.expand_shape %1914 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1915 = tensor.empty() : tensor<1x80x4096xf32>
+    %1916 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829, %expanded_1714 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1915 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1917 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1715 = arith.constant 2.000000e+00 : f32
+    %1918 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916 : tensor<1x80x4096xf32>) outs(%1917 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1715 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1716 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1919 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1918 : tensor<1x80x4096xf32>) outs(%cst_1716 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1717 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1920 = tensor.empty() : tensor<1x80x1xf32>
+    %1921 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1919, %cst_1717 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1920 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1922 = tensor.empty() : tensor<1x80x1xf32>
+    %1923 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1921 : tensor<1x80x1xf32>) outs(%1922 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1924 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1718 = tensor.collapse_shape %1923 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1925 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916, %collapsed_1718 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1924 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1719 = tensor.expand_shape %extracted_slice_32 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1926 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1720 = tensor.collapse_shape %expanded_1719 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1927 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1720, %1925 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1926 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1928 = tensor.empty() : tensor<4096x11008xf32>
+    %1929 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_298 : tensor<11008x4096xf32>) outs(%1928 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1721 = tensor.collapse_shape %1927 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1722 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1930 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1721, %1929 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1722 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1723 = tensor.expand_shape %1930 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1931 = tensor.empty() : tensor<1x80x11008xf32>
+    %1932 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1723 : tensor<1x80x11008xf32>) outs(%1931 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %1933 = tensor.empty() : tensor<4096x11008xf32>
+    %1934 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_300 : tensor<11008x4096xf32>) outs(%1933 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1724 = tensor.collapse_shape %1927 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1725 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %1935 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1724, %1934 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1725 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1726 = tensor.expand_shape %1935 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %1936 = tensor.empty() : tensor<1x80x11008xf32>
+    %1937 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1932, %expanded_1726 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1936 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %1938 = tensor.empty() : tensor<11008x4096xf32>
+    %1939 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_302 : tensor<4096x11008xf32>) outs(%1938 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1727 = tensor.collapse_shape %1937 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1728 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1940 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1727, %1939 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1728 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1729 = tensor.expand_shape %1940 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1941 = tensor.empty() : tensor<1x80x4096xf32>
+    %1942 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916, %expanded_1729 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1941 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1943 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1730 = arith.constant 2.000000e+00 : f32
+    %1944 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942 : tensor<1x80x4096xf32>) outs(%1943 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1730 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1731 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %1945 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1944 : tensor<1x80x4096xf32>) outs(%cst_1731 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1732 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %1946 = tensor.empty() : tensor<1x80x1xf32>
+    %1947 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1945, %cst_1732 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1946 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1948 = tensor.empty() : tensor<1x80x1xf32>
+    %1949 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1947 : tensor<1x80x1xf32>) outs(%1948 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %1950 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1733 = tensor.collapse_shape %1949 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %1951 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942, %collapsed_1733 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1950 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1734 = tensor.expand_shape %extracted_slice_33 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %1952 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1735 = tensor.collapse_shape %expanded_1734 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %1953 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1735, %1951 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1952 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %1954 = tensor.empty() : tensor<4096x4096xf32>
+    %1955 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_304 : tensor<4096x4096xf32>) outs(%1954 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1736 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1737 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1956 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1736, %1955 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1737 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1738 = tensor.expand_shape %1956 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1957 = tensor.empty() : tensor<4096x4096xf32>
+    %1958 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_306 : tensor<4096x4096xf32>) outs(%1957 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1739 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1740 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1959 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1739, %1958 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1740 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1741 = tensor.expand_shape %1959 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %1960 = tensor.empty() : tensor<4096x4096xf32>
+    %1961 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_308 : tensor<4096x4096xf32>) outs(%1960 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1742 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1743 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %1962 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1742, %1961 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1743 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1744 = tensor.expand_shape %1962 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1745 = tensor.expand_shape %expanded_1738 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1963 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1964 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1745 : tensor<1x80x32x128xf32>) outs(%1963 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1746 = tensor.expand_shape %expanded_1741 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1965 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1966 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1746 : tensor<1x80x32x128xf32>) outs(%1965 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1747 = tensor.expand_shape %expanded_1744 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %1967 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1968 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1747 : tensor<1x80x32x128xf32>) outs(%1967 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1748 = tensor.extract_slice %expanded_584[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1749 = tensor.extract_slice %expanded_586[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %1969 = tensor.empty() : tensor<1x80x128xf32>
+    %1970 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1748 : tensor<1x1x80x128xf32>) outs(%1969 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1971 = tensor.empty() : tensor<80x128xf32>
+    %1972 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1970 : tensor<1x80x128xf32>) outs(%1971 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1973 = tensor.empty() : tensor<1x80x128xf32>
+    %1974 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1749 : tensor<1x1x80x128xf32>) outs(%1973 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %1975 = tensor.empty() : tensor<80x128xf32>
+    %1976 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1974 : tensor<1x80x128xf32>) outs(%1975 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %1977 = tensor.empty() : tensor<1x80x128xf32>
+    %1978 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1977 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1972[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1750 = tensor.expand_shape %1978 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1979 = tensor.empty() : tensor<1x80x128xf32>
+    %1980 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1979 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %1976[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1751 = tensor.expand_shape %1980 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %1981 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1982 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1964, %1978 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1981 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1752 = tensor.extract_slice %1964[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1753 = tensor.extract_slice %1964[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1983 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1984 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1753 : tensor<1x32x80x64xf32>) outs(%1983 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1985 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1754 = tensor.insert_slice %1984 into %1985[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1755 = tensor.insert_slice %extracted_slice_1752 into %inserted_slice_1754[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1986 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1987 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1755, %1980 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1986 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1988 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1989 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1982, %1987 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1988 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1990 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1991 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1966, %1978 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1990 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1756 = tensor.extract_slice %1966[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1757 = tensor.extract_slice %1966[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %1992 = tensor.empty() : tensor<1x32x80x64xf32>
+    %1993 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1757 : tensor<1x32x80x64xf32>) outs(%1992 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %1994 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1758 = tensor.insert_slice %1993 into %1994[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1759 = tensor.insert_slice %extracted_slice_1756 into %inserted_slice_1758[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %1995 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1996 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1759, %1980 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1995 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1997 = tensor.empty() : tensor<1x32x80x128xf32>
+    %1998 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1991, %1996 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1997 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %1999 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2000 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1998 : tensor<1x32x80x128xf32>) outs(%1999 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1760 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1761 = tensor.collapse_shape %1989 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1762 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1763 = tensor.collapse_shape %2000 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1764 = arith.constant 0.000000e+00 : f32
+    %2001 = tensor.empty() : tensor<32x80x80xf32>
+    %2002 = linalg.fill ins(%cst_1764 : f32) outs(%2001 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2003 = linalg.batch_matmul ins(%collapsed_1761, %collapsed_1763 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2002 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1765 = tensor.expand_shape %2003 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1766 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2004 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2005 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1766 : tensor<1x32x80x80xf32>) outs(%2004 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2006 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2007 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1765, %2005 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2006 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2008 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1767 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2009 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2007, %collapsed_1767 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2008 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2010 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2011 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2010 : tensor<1x32x80x1xf32>) { // Fill the max accumulator with -inf.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // IEEE-754 bit pattern of -inf.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2012 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2009 : tensor<1x32x80x80xf32>) outs(%2011 : tensor<1x32x80x1xf32>) { // Max-reduction seeded by the -inf fill (%2011); the raw tensor.empty (%2010) has undefined contents.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2013 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2014 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2009, %2012 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2013 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2015 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2016 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2015 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2017 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2014 : tensor<1x32x80x80xf32>) outs(%2016 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2018 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2019 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2014, %2017 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2018 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1768 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1769 = tensor.collapse_shape %2019 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1770 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1771 = tensor.collapse_shape %1968 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1772 = arith.constant 0.000000e+00 : f32
+    %2020 = tensor.empty() : tensor<32x80x128xf32>
+    %2021 = linalg.fill ins(%cst_1772 : f32) outs(%2020 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2022 = linalg.batch_matmul ins(%collapsed_1769, %collapsed_1771 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2021 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1773 = tensor.expand_shape %2022 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2023 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2024 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1773 : tensor<1x32x80x128xf32>) outs(%2023 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1774 = tensor.collapse_shape %2024 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2025 = tensor.empty() : tensor<4096x4096xf32>
+    %2026 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_310 : tensor<4096x4096xf32>) outs(%2025 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1775 = tensor.collapse_shape %collapsed_1774 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1776 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2027 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1775, %2026 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1776 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1777 = tensor.expand_shape %2027 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2028 = tensor.empty() : tensor<1x80x4096xf32>
+    %2029 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942, %expanded_1777 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2028 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2030 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1778 = arith.constant 2.000000e+00 : f32
+    %2031 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029 : tensor<1x80x4096xf32>) outs(%2030 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1778 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1779 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2032 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2031 : tensor<1x80x4096xf32>) outs(%cst_1779 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1780 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2033 = tensor.empty() : tensor<1x80x1xf32>
+    %2034 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2032, %cst_1780 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2033 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2035 = tensor.empty() : tensor<1x80x1xf32>
+    %2036 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2034 : tensor<1x80x1xf32>) outs(%2035 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2037 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1781 = tensor.collapse_shape %2036 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2038 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029, %collapsed_1781 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2037 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1782 = tensor.expand_shape %extracted_slice_34 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2039 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1783 = tensor.collapse_shape %expanded_1782 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2040 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1783, %2038 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2039 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2041 = tensor.empty() : tensor<4096x11008xf32>
+    %2042 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_312 : tensor<11008x4096xf32>) outs(%2041 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1784 = tensor.collapse_shape %2040 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1785 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2043 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1784, %2042 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1785 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1786 = tensor.expand_shape %2043 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2044 = tensor.empty() : tensor<1x80x11008xf32>
+    %2045 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1786 : tensor<1x80x11008xf32>) outs(%2044 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2046 = tensor.empty() : tensor<4096x11008xf32>
+    %2047 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_314 : tensor<11008x4096xf32>) outs(%2046 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1787 = tensor.collapse_shape %2040 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1788 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2048 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1787, %2047 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1788 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1789 = tensor.expand_shape %2048 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2049 = tensor.empty() : tensor<1x80x11008xf32>
+    %2050 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2045, %expanded_1789 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2049 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2051 = tensor.empty() : tensor<11008x4096xf32>
+    %2052 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_316 : tensor<4096x11008xf32>) outs(%2051 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1790 = tensor.collapse_shape %2050 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1791 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2053 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1790, %2052 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1791 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1792 = tensor.expand_shape %2053 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2054 = tensor.empty() : tensor<1x80x4096xf32>
+    %2055 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029, %expanded_1792 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2054 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2056 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1793 = arith.constant 2.000000e+00 : f32
+    %2057 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055 : tensor<1x80x4096xf32>) outs(%2056 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1793 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1794 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2058 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2057 : tensor<1x80x4096xf32>) outs(%cst_1794 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1795 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2059 = tensor.empty() : tensor<1x80x1xf32>
+    %2060 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2058, %cst_1795 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2059 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2061 = tensor.empty() : tensor<1x80x1xf32>
+    %2062 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2060 : tensor<1x80x1xf32>) outs(%2061 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2063 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1796 = tensor.collapse_shape %2062 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2064 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055, %collapsed_1796 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2063 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1797 = tensor.expand_shape %extracted_slice_35 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2065 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1798 = tensor.collapse_shape %expanded_1797 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2066 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1798, %2064 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2065 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2067 = tensor.empty() : tensor<4096x4096xf32>
+    %2068 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_318 : tensor<4096x4096xf32>) outs(%2067 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1799 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1800 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2069 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1799, %2068 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1800 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1801 = tensor.expand_shape %2069 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2070 = tensor.empty() : tensor<4096x4096xf32>
+    %2071 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_320 : tensor<4096x4096xf32>) outs(%2070 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1802 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1803 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2072 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1802, %2071 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1803 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1804 = tensor.expand_shape %2072 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2073 = tensor.empty() : tensor<4096x4096xf32>
+    %2074 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_322 : tensor<4096x4096xf32>) outs(%2073 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1805 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1806 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2075 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1805, %2074 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1806 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1807 = tensor.expand_shape %2075 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1808 = tensor.expand_shape %expanded_1801 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2076 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2077 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1808 : tensor<1x80x32x128xf32>) outs(%2076 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1809 = tensor.expand_shape %expanded_1804 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2078 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2079 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1809 : tensor<1x80x32x128xf32>) outs(%2078 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1810 = tensor.expand_shape %expanded_1807 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2080 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2081 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1810 : tensor<1x80x32x128xf32>) outs(%2080 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1811 = tensor.extract_slice %expanded_588[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1812 = tensor.extract_slice %expanded_590[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2082 = tensor.empty() : tensor<1x80x128xf32>
+    %2083 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1811 : tensor<1x1x80x128xf32>) outs(%2082 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2084 = tensor.empty() : tensor<80x128xf32>
+    %2085 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2083 : tensor<1x80x128xf32>) outs(%2084 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2086 = tensor.empty() : tensor<1x80x128xf32>
+    %2087 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1812 : tensor<1x1x80x128xf32>) outs(%2086 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2088 = tensor.empty() : tensor<80x128xf32>
+    %2089 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2087 : tensor<1x80x128xf32>) outs(%2088 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2090 = tensor.empty() : tensor<1x80x128xf32>
+    %2091 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2090 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2085[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1813 = tensor.expand_shape %2091 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2092 = tensor.empty() : tensor<1x80x128xf32>
+    %2093 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2092 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2089[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1814 = tensor.expand_shape %2093 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2094 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2095 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2077, %2091 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2094 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1815 = tensor.extract_slice %2077[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1816 = tensor.extract_slice %2077[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2096 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2097 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1816 : tensor<1x32x80x64xf32>) outs(%2096 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2098 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1817 = tensor.insert_slice %2097 into %2098[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1818 = tensor.insert_slice %extracted_slice_1815 into %inserted_slice_1817[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2099 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2100 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1818, %2093 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2099 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2101 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2102 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2095, %2100 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2101 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2103 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2104 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2079, %2091 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2103 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1819 = tensor.extract_slice %2079[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1820 = tensor.extract_slice %2079[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2105 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2106 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1820 : tensor<1x32x80x64xf32>) outs(%2105 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2107 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1821 = tensor.insert_slice %2106 into %2107[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1822 = tensor.insert_slice %extracted_slice_1819 into %inserted_slice_1821[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2108 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2109 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1822, %2093 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2108 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2110 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2111 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2104, %2109 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2110 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2112 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2113 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2111 : tensor<1x32x80x128xf32>) outs(%2112 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1823 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1824 = tensor.collapse_shape %2102 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1825 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1826 = tensor.collapse_shape %2113 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1827 = arith.constant 0.000000e+00 : f32
+    %2114 = tensor.empty() : tensor<32x80x80xf32>
+    %2115 = linalg.fill ins(%cst_1827 : f32) outs(%2114 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2116 = linalg.batch_matmul ins(%collapsed_1824, %collapsed_1826 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2115 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1828 = tensor.expand_shape %2116 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1829 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2117 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2118 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1829 : tensor<1x32x80x80xf32>) outs(%2117 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2119 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2120 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1828, %2118 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2119 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2121 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1830 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2122 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2120, %collapsed_1830 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2121 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2123 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2124 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2123 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // f32 bit pattern of -inf: the identity element for the max-reduction below.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2125 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2122 : tensor<1x32x80x80xf32>) outs(%2124 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32 // %out starts from the -inf fill (%2124); the raw tensor.empty (%2123) has undefined contents.
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2126 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2127 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2122, %2125 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2126 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2128 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2129 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2128 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2130 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2127 : tensor<1x32x80x80xf32>) outs(%2129 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2131 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2132 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2127, %2130 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2131 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1831 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1832 = tensor.collapse_shape %2132 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1833 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1834 = tensor.collapse_shape %2081 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1835 = arith.constant 0.000000e+00 : f32
+    %2133 = tensor.empty() : tensor<32x80x128xf32>
+    %2134 = linalg.fill ins(%cst_1835 : f32) outs(%2133 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2135 = linalg.batch_matmul ins(%collapsed_1832, %collapsed_1834 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2134 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1836 = tensor.expand_shape %2135 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2136 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2137 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1836 : tensor<1x32x80x128xf32>) outs(%2136 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1837 = tensor.collapse_shape %2137 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2138 = tensor.empty() : tensor<4096x4096xf32>
+    %2139 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_324 : tensor<4096x4096xf32>) outs(%2138 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1838 = tensor.collapse_shape %collapsed_1837 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1839 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2140 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1838, %2139 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1839 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1840 = tensor.expand_shape %2140 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2141 = tensor.empty() : tensor<1x80x4096xf32>
+    %2142 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055, %expanded_1840 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2141 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2143 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1841 = arith.constant 2.000000e+00 : f32
+    %2144 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142 : tensor<1x80x4096xf32>) outs(%2143 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1841 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1842 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2145 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2144 : tensor<1x80x4096xf32>) outs(%cst_1842 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1843 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2146 = tensor.empty() : tensor<1x80x1xf32>
+    %2147 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2145, %cst_1843 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2146 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2148 = tensor.empty() : tensor<1x80x1xf32>
+    %2149 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2147 : tensor<1x80x1xf32>) outs(%2148 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2150 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1844 = tensor.collapse_shape %2149 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2151 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142, %collapsed_1844 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2150 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1845 = tensor.expand_shape %extracted_slice_36 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2152 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1846 = tensor.collapse_shape %expanded_1845 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2153 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1846, %2151 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2152 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2154 = tensor.empty() : tensor<4096x11008xf32>
+    %2155 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_326 : tensor<11008x4096xf32>) outs(%2154 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1847 = tensor.collapse_shape %2153 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1848 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2156 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1847, %2155 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1848 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1849 = tensor.expand_shape %2156 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2157 = tensor.empty() : tensor<1x80x11008xf32>
+    %2158 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1849 : tensor<1x80x11008xf32>) outs(%2157 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2159 = tensor.empty() : tensor<4096x11008xf32>
+    %2160 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_328 : tensor<11008x4096xf32>) outs(%2159 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1850 = tensor.collapse_shape %2153 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1851 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2161 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1850, %2160 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1851 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1852 = tensor.expand_shape %2161 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2162 = tensor.empty() : tensor<1x80x11008xf32>
+    %2163 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2158, %expanded_1852 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2162 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2164 = tensor.empty() : tensor<11008x4096xf32>
+    %2165 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_330 : tensor<4096x11008xf32>) outs(%2164 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1853 = tensor.collapse_shape %2163 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1854 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2166 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1853, %2165 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1854 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1855 = tensor.expand_shape %2166 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2167 = tensor.empty() : tensor<1x80x4096xf32>
+    %2168 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142, %expanded_1855 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2167 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2169 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1856 = arith.constant 2.000000e+00 : f32
+    %2170 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168 : tensor<1x80x4096xf32>) outs(%2169 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1856 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1857 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2171 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2170 : tensor<1x80x4096xf32>) outs(%cst_1857 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1858 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2172 = tensor.empty() : tensor<1x80x1xf32>
+    %2173 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2171, %cst_1858 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2172 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2174 = tensor.empty() : tensor<1x80x1xf32>
+    %2175 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2173 : tensor<1x80x1xf32>) outs(%2174 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2176 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1859 = tensor.collapse_shape %2175 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2177 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168, %collapsed_1859 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2176 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1860 = tensor.expand_shape %extracted_slice_37 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2178 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1861 = tensor.collapse_shape %expanded_1860 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2179 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1861, %2177 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2178 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2180 = tensor.empty() : tensor<4096x4096xf32>
+    %2181 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_332 : tensor<4096x4096xf32>) outs(%2180 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1862 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1863 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2182 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1862, %2181 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1863 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1864 = tensor.expand_shape %2182 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2183 = tensor.empty() : tensor<4096x4096xf32>
+    %2184 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_334 : tensor<4096x4096xf32>) outs(%2183 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1865 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1866 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2185 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1865, %2184 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1866 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1867 = tensor.expand_shape %2185 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2186 = tensor.empty() : tensor<4096x4096xf32>
+    %2187 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_336 : tensor<4096x4096xf32>) outs(%2186 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1868 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1869 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2188 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1868, %2187 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1869 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1870 = tensor.expand_shape %2188 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1871 = tensor.expand_shape %expanded_1864 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2189 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2190 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1871 : tensor<1x80x32x128xf32>) outs(%2189 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1872 = tensor.expand_shape %expanded_1867 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2191 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2192 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1872 : tensor<1x80x32x128xf32>) outs(%2191 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1873 = tensor.expand_shape %expanded_1870 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2193 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2194 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1873 : tensor<1x80x32x128xf32>) outs(%2193 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1874 = tensor.extract_slice %expanded_592[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1875 = tensor.extract_slice %expanded_594[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2195 = tensor.empty() : tensor<1x80x128xf32>
+    %2196 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1874 : tensor<1x1x80x128xf32>) outs(%2195 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2197 = tensor.empty() : tensor<80x128xf32>
+    %2198 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2196 : tensor<1x80x128xf32>) outs(%2197 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2199 = tensor.empty() : tensor<1x80x128xf32>
+    %2200 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1875 : tensor<1x1x80x128xf32>) outs(%2199 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2201 = tensor.empty() : tensor<80x128xf32>
+    %2202 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2200 : tensor<1x80x128xf32>) outs(%2201 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2203 = tensor.empty() : tensor<1x80x128xf32>
+    %2204 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2203 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2198[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1876 = tensor.expand_shape %2204 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2205 = tensor.empty() : tensor<1x80x128xf32>
+    %2206 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2205 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2202[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1877 = tensor.expand_shape %2206 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2207 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2208 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2190, %2204 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2207 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1878 = tensor.extract_slice %2190[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1879 = tensor.extract_slice %2190[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2209 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2210 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1879 : tensor<1x32x80x64xf32>) outs(%2209 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2211 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1880 = tensor.insert_slice %2210 into %2211[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1881 = tensor.insert_slice %extracted_slice_1878 into %inserted_slice_1880[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2212 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2213 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1881, %2206 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2212 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2214 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2215 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2208, %2213 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2214 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2216 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2217 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2192, %2204 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2216 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1882 = tensor.extract_slice %2192[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1883 = tensor.extract_slice %2192[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2218 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2219 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1883 : tensor<1x32x80x64xf32>) outs(%2218 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2220 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1884 = tensor.insert_slice %2219 into %2220[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1885 = tensor.insert_slice %extracted_slice_1882 into %inserted_slice_1884[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2221 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2222 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1885, %2206 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2221 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2223 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2224 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2217, %2222 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2223 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2225 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2226 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2224 : tensor<1x32x80x128xf32>) outs(%2225 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1886 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1887 = tensor.collapse_shape %2215 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1888 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1889 = tensor.collapse_shape %2226 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1890 = arith.constant 0.000000e+00 : f32
+    %2227 = tensor.empty() : tensor<32x80x80xf32>
+    %2228 = linalg.fill ins(%cst_1890 : f32) outs(%2227 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2229 = linalg.batch_matmul ins(%collapsed_1887, %collapsed_1889 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2228 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1891 = tensor.expand_shape %2229 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1892 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2230 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2231 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1892 : tensor<1x32x80x80xf32>) outs(%2230 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2232 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2233 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1891, %2231 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2232 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2234 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1893 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2235 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2233, %collapsed_1893 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2234 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2236 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2237 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2236 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2238 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2235 : tensor<1x32x80x80xf32>) outs(%2237 : tensor<1x32x80x1xf32>) { // row-max for softmax; accumulator must start from the -inf fill (%2237), not the uninitialized tensor.empty (%2236)
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2239 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2240 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2235, %2238 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2239 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2241 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2242 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2241 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2243 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2240 : tensor<1x32x80x80xf32>) outs(%2242 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2244 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2245 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2240, %2243 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2244 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1894 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1895 = tensor.collapse_shape %2245 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1896 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1897 = tensor.collapse_shape %2194 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1898 = arith.constant 0.000000e+00 : f32
+    %2246 = tensor.empty() : tensor<32x80x128xf32>
+    %2247 = linalg.fill ins(%cst_1898 : f32) outs(%2246 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2248 = linalg.batch_matmul ins(%collapsed_1895, %collapsed_1897 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2247 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1899 = tensor.expand_shape %2248 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2249 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2250 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1899 : tensor<1x32x80x128xf32>) outs(%2249 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1900 = tensor.collapse_shape %2250 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2251 = tensor.empty() : tensor<4096x4096xf32>
+    %2252 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_338 : tensor<4096x4096xf32>) outs(%2251 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1901 = tensor.collapse_shape %collapsed_1900 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1902 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2253 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1901, %2252 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1902 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1903 = tensor.expand_shape %2253 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2254 = tensor.empty() : tensor<1x80x4096xf32>
+    %2255 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168, %expanded_1903 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2254 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2256 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1904 = arith.constant 2.000000e+00 : f32
+    %2257 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255 : tensor<1x80x4096xf32>) outs(%2256 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1904 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1905 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2258 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2257 : tensor<1x80x4096xf32>) outs(%cst_1905 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1906 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2259 = tensor.empty() : tensor<1x80x1xf32>
+    %2260 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2258, %cst_1906 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2259 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2261 = tensor.empty() : tensor<1x80x1xf32>
+    %2262 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2260 : tensor<1x80x1xf32>) outs(%2261 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2263 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1907 = tensor.collapse_shape %2262 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2264 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255, %collapsed_1907 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2263 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1908 = tensor.expand_shape %extracted_slice_38 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2265 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1909 = tensor.collapse_shape %expanded_1908 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2266 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1909, %2264 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2265 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2267 = tensor.empty() : tensor<4096x11008xf32>
+    %2268 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_340 : tensor<11008x4096xf32>) outs(%2267 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1910 = tensor.collapse_shape %2266 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1911 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2269 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1910, %2268 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1911 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1912 = tensor.expand_shape %2269 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2270 = tensor.empty() : tensor<1x80x11008xf32>
+    %2271 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1912 : tensor<1x80x11008xf32>) outs(%2270 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2272 = tensor.empty() : tensor<4096x11008xf32>
+    %2273 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_342 : tensor<11008x4096xf32>) outs(%2272 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1913 = tensor.collapse_shape %2266 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1914 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2274 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1913, %2273 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1914 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1915 = tensor.expand_shape %2274 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2275 = tensor.empty() : tensor<1x80x11008xf32>
+    %2276 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2271, %expanded_1915 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2275 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2277 = tensor.empty() : tensor<11008x4096xf32>
+    %2278 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_344 : tensor<4096x11008xf32>) outs(%2277 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1916 = tensor.collapse_shape %2276 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1917 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2279 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1916, %2278 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1917 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1918 = tensor.expand_shape %2279 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2280 = tensor.empty() : tensor<1x80x4096xf32>
+    %2281 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255, %expanded_1918 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2280 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2282 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1919 = arith.constant 2.000000e+00 : f32
+    %2283 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2281 : tensor<1x80x4096xf32>) outs(%2282 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1919 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1920 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2284 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2283 : tensor<1x80x4096xf32>) outs(%cst_1920 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1921 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2285 = tensor.empty() : tensor<1x80x1xf32>
+    %2286 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2284, %cst_1921 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2285 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2287 = tensor.empty() : tensor<1x80x1xf32>
+    %2288 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2286 : tensor<1x80x1xf32>) outs(%2287 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2289 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1922 = tensor.collapse_shape %2288 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2290 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2281, %collapsed_1922 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2289 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1923 = tensor.expand_shape %extracted_slice_39 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2291 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1924 = tensor.collapse_shape %expanded_1923 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2292 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1924, %2290 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2291 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2293 = tensor.empty() : tensor<4096x4096xf32>
+    %2294 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_346 : tensor<4096x4096xf32>) outs(%2293 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1925 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1926 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2295 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1925, %2294 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1926 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1927 = tensor.expand_shape %2295 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2296 = tensor.empty() : tensor<4096x4096xf32>
+    %2297 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_348 : tensor<4096x4096xf32>) outs(%2296 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1928 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1929 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2298 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1928, %2297 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1929 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1930 = tensor.expand_shape %2298 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2299 = tensor.empty() : tensor<4096x4096xf32>
+    %2300 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_350 : tensor<4096x4096xf32>) outs(%2299 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1931 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1932 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2301 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1931, %2300 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1932 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1933 = tensor.expand_shape %2301 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1934 = tensor.expand_shape %expanded_1927 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2302 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2303 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1934 : tensor<1x80x32x128xf32>) outs(%2302 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1935 = tensor.expand_shape %expanded_1930 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2304 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2305 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1935 : tensor<1x80x32x128xf32>) outs(%2304 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1936 = tensor.expand_shape %expanded_1933 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2306 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2307 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1936 : tensor<1x80x32x128xf32>) outs(%2306 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1937 = tensor.extract_slice %expanded_596[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_1938 = tensor.extract_slice %expanded_598[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2308 = tensor.empty() : tensor<1x80x128xf32>
+    %2309 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1937 : tensor<1x1x80x128xf32>) outs(%2308 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2310 = tensor.empty() : tensor<80x128xf32>
+    %2311 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2309 : tensor<1x80x128xf32>) outs(%2310 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2312 = tensor.empty() : tensor<1x80x128xf32>
+    %2313 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1938 : tensor<1x1x80x128xf32>) outs(%2312 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2314 = tensor.empty() : tensor<80x128xf32>
+    %2315 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2313 : tensor<1x80x128xf32>) outs(%2314 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2316 = tensor.empty() : tensor<1x80x128xf32>
+    %2317 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2316 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2311[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1939 = tensor.expand_shape %2317 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2318 = tensor.empty() : tensor<1x80x128xf32>
+    %2319 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2318 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2315[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_1940 = tensor.expand_shape %2319 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2320 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2321 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2303, %2317 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2320 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1941 = tensor.extract_slice %2303[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1942 = tensor.extract_slice %2303[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2322 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2323 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1942 : tensor<1x32x80x64xf32>) outs(%2322 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2324 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1943 = tensor.insert_slice %2323 into %2324[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1944 = tensor.insert_slice %extracted_slice_1941 into %inserted_slice_1943[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2325 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2326 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1944, %2319 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2325 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2327 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2328 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2321, %2326 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2327 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2329 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2330 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2305, %2317 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2329 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_1945 = tensor.extract_slice %2305[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_1946 = tensor.extract_slice %2305[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2331 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2332 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1946 : tensor<1x32x80x64xf32>) outs(%2331 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2333 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_1947 = tensor.insert_slice %2332 into %2333[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_1948 = tensor.insert_slice %extracted_slice_1945 into %inserted_slice_1947[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2334 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2335 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1948, %2319 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2334 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2336 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2337 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2330, %2335 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2336 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2338 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2339 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2337 : tensor<1x32x80x128xf32>) outs(%2338 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_1949 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1950 = tensor.collapse_shape %2328 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1951 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_1952 = tensor.collapse_shape %2339 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_1953 = arith.constant 0.000000e+00 : f32
+    %2340 = tensor.empty() : tensor<32x80x80xf32>
+    %2341 = linalg.fill ins(%cst_1953 : f32) outs(%2340 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2342 = linalg.batch_matmul ins(%collapsed_1950, %collapsed_1952 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2341 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_1954 = tensor.expand_shape %2342 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_1955 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2343 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2344 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1955 : tensor<1x32x80x80xf32>) outs(%2343 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2345 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2346 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1954, %2344 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2345 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2347 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_1956 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2348 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2346, %collapsed_1956 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2347 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2349 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2350 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2349 : tensor<1x32x80x1xf32>) { // Fill the accumulator with 0xFF800000 (f32 -inf), the identity element for max.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2351 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2348 : tensor<1x32x80x80xf32>) outs(%2350 : tensor<1x32x80x1xf32>) { // Max-reduce %2348; the body reads %out, so the init must be the -inf-filled %2350 rather than the uninitialized tensor.empty %2349.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2352 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2353 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2348, %2351 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2352 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2354 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2355 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2354 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2356 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2353 : tensor<1x32x80x80xf32>) outs(%2355 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2357 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2358 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2353, %2356 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2357 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_1957 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_1958 = tensor.collapse_shape %2358 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_1959 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_1960 = tensor.collapse_shape %2307 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_1961 = arith.constant 0.000000e+00 : f32
+    %2359 = tensor.empty() : tensor<32x80x128xf32>
+    %2360 = linalg.fill ins(%cst_1961 : f32) outs(%2359 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2361 = linalg.batch_matmul ins(%collapsed_1958, %collapsed_1960 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2360 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_1962 = tensor.expand_shape %2361 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2362 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2363 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1962 : tensor<1x32x80x128xf32>) outs(%2362 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_1963 = tensor.collapse_shape %2363 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2364 = tensor.empty() : tensor<4096x4096xf32>
+    %2365 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_352 : tensor<4096x4096xf32>) outs(%2364 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1964 = tensor.collapse_shape %collapsed_1963 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1965 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2366 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1964, %2365 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1965 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1966 = tensor.expand_shape %2366 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2367 = tensor.empty() : tensor<1x80x4096xf32>
+    %2368 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2281, %expanded_1966 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2367 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2369 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1967 = arith.constant 2.000000e+00 : f32
+    %2370 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368 : tensor<1x80x4096xf32>) outs(%2369 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1967 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1968 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2371 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2370 : tensor<1x80x4096xf32>) outs(%cst_1968 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1969 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2372 = tensor.empty() : tensor<1x80x1xf32>
+    %2373 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2371, %cst_1969 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2372 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2374 = tensor.empty() : tensor<1x80x1xf32>
+    %2375 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2373 : tensor<1x80x1xf32>) outs(%2374 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2376 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1970 = tensor.collapse_shape %2375 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2377 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368, %collapsed_1970 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2376 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1971 = tensor.expand_shape %extracted_slice_40 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2378 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1972 = tensor.collapse_shape %expanded_1971 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2379 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1972, %2377 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2378 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2380 = tensor.empty() : tensor<4096x11008xf32>
+    %2381 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_354 : tensor<11008x4096xf32>) outs(%2380 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1973 = tensor.collapse_shape %2379 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1974 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2382 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1973, %2381 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1974 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1975 = tensor.expand_shape %2382 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2383 = tensor.empty() : tensor<1x80x11008xf32>
+    %2384 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1975 : tensor<1x80x11008xf32>) outs(%2383 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2385 = tensor.empty() : tensor<4096x11008xf32>
+    %2386 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_356 : tensor<11008x4096xf32>) outs(%2385 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_1976 = tensor.collapse_shape %2379 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1977 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2387 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1976, %2386 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1977 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_1978 = tensor.expand_shape %2387 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2388 = tensor.empty() : tensor<1x80x11008xf32>
+    %2389 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2384, %expanded_1978 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2388 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2390 = tensor.empty() : tensor<11008x4096xf32>
+    %2391 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_358 : tensor<4096x11008xf32>) outs(%2390 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_1979 = tensor.collapse_shape %2389 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_1980 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2392 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1979, %2391 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1980 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1981 = tensor.expand_shape %2392 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2393 = tensor.empty() : tensor<1x80x4096xf32>
+    %2394 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368, %expanded_1981 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2393 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2395 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_1982 = arith.constant 2.000000e+00 : f32
+    %2396 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394 : tensor<1x80x4096xf32>) outs(%2395 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_1982 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_1983 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2397 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2396 : tensor<1x80x4096xf32>) outs(%cst_1983 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_1984 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2398 = tensor.empty() : tensor<1x80x1xf32>
+    %2399 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2397, %cst_1984 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2398 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2400 = tensor.empty() : tensor<1x80x1xf32>
+    %2401 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2399 : tensor<1x80x1xf32>) outs(%2400 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2402 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1985 = tensor.collapse_shape %2401 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2403 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394, %collapsed_1985 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2402 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_1986 = tensor.expand_shape %extracted_slice_41 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2404 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_1987 = tensor.collapse_shape %expanded_1986 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2405 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1987, %2403 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2404 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2406 = tensor.empty() : tensor<4096x4096xf32>
+    %2407 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_360 : tensor<4096x4096xf32>) outs(%2406 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1988 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1989 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2408 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1988, %2407 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1989 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1990 = tensor.expand_shape %2408 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2409 = tensor.empty() : tensor<4096x4096xf32>
+    %2410 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_362 : tensor<4096x4096xf32>) outs(%2409 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1991 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1992 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2411 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1991, %2410 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1992 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1993 = tensor.expand_shape %2411 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2412 = tensor.empty() : tensor<4096x4096xf32>
+    %2413 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_364 : tensor<4096x4096xf32>) outs(%2412 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_1994 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_1995 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2414 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1994, %2413 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1995 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_1996 = tensor.expand_shape %2414 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_1997 = tensor.expand_shape %expanded_1990 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2415 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2416 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1997 : tensor<1x80x32x128xf32>) outs(%2415 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1998 = tensor.expand_shape %expanded_1993 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2417 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2418 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1998 : tensor<1x80x32x128xf32>) outs(%2417 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_1999 = tensor.expand_shape %expanded_1996 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2419 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2420 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1999 : tensor<1x80x32x128xf32>) outs(%2419 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2000 = tensor.extract_slice %expanded_600[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2001 = tensor.extract_slice %expanded_602[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2421 = tensor.empty() : tensor<1x80x128xf32>
+    %2422 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2000 : tensor<1x1x80x128xf32>) outs(%2421 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2423 = tensor.empty() : tensor<80x128xf32>
+    %2424 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2422 : tensor<1x80x128xf32>) outs(%2423 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2425 = tensor.empty() : tensor<1x80x128xf32>
+    %2426 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2001 : tensor<1x1x80x128xf32>) outs(%2425 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2427 = tensor.empty() : tensor<80x128xf32>
+    %2428 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2426 : tensor<1x80x128xf32>) outs(%2427 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2429 = tensor.empty() : tensor<1x80x128xf32>
+    %2430 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2429 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2424[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2002 = tensor.expand_shape %2430 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2431 = tensor.empty() : tensor<1x80x128xf32>
+    %2432 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2431 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2428[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2003 = tensor.expand_shape %2432 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2433 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2434 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2416, %2430 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2433 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2004 = tensor.extract_slice %2416[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2005 = tensor.extract_slice %2416[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2435 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2436 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2005 : tensor<1x32x80x64xf32>) outs(%2435 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2437 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2006 = tensor.insert_slice %2436 into %2437[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2007 = tensor.insert_slice %extracted_slice_2004 into %inserted_slice_2006[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2438 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2439 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2007, %2432 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2438 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2440 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2441 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2434, %2439 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2440 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2442 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2443 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2418, %2430 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2442 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2008 = tensor.extract_slice %2418[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2009 = tensor.extract_slice %2418[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2444 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2445 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2009 : tensor<1x32x80x64xf32>) outs(%2444 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2446 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2010 = tensor.insert_slice %2445 into %2446[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2011 = tensor.insert_slice %extracted_slice_2008 into %inserted_slice_2010[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2447 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2448 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2011, %2432 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2447 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2449 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2450 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2443, %2448 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2449 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2451 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2452 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2450 : tensor<1x32x80x128xf32>) outs(%2451 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2012 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2013 = tensor.collapse_shape %2441 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2014 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2015 = tensor.collapse_shape %2452 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2016 = arith.constant 0.000000e+00 : f32
+    %2453 = tensor.empty() : tensor<32x80x80xf32>
+    %2454 = linalg.fill ins(%cst_2016 : f32) outs(%2453 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2455 = linalg.batch_matmul ins(%collapsed_2013, %collapsed_2015 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2454 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2017 = tensor.expand_shape %2455 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2018 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2456 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2457 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2018 : tensor<1x32x80x80xf32>) outs(%2456 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2458 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2459 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2017, %2457 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2458 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2460 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2019 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2461 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2459, %collapsed_2019 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2460 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2462 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2463 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2462 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2464 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2461 : tensor<1x32x80x80xf32>) outs(%2463 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32  // running max; accumulator %out is seeded from %2463 (0xFF800000 = -inf fill), not the raw tensor.empty
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2465 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2466 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2461, %2464 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2465 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2467 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2468 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2467 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2469 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2466 : tensor<1x32x80x80xf32>) outs(%2468 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2470 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2471 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2466, %2469 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2470 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2020 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2021 = tensor.collapse_shape %2471 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2022 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2023 = tensor.collapse_shape %2420 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2024 = arith.constant 0.000000e+00 : f32
+    %2472 = tensor.empty() : tensor<32x80x128xf32>
+    %2473 = linalg.fill ins(%cst_2024 : f32) outs(%2472 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2474 = linalg.batch_matmul ins(%collapsed_2021, %collapsed_2023 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2473 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2025 = tensor.expand_shape %2474 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2475 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2476 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2025 : tensor<1x32x80x128xf32>) outs(%2475 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2026 = tensor.collapse_shape %2476 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2477 = tensor.empty() : tensor<4096x4096xf32>
+    %2478 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_366 : tensor<4096x4096xf32>) outs(%2477 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2027 = tensor.collapse_shape %collapsed_2026 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2028 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2479 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2027, %2478 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2028 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2029 = tensor.expand_shape %2479 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2480 = tensor.empty() : tensor<1x80x4096xf32>
+    %2481 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394, %expanded_2029 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2480 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2482 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2030 = arith.constant 2.000000e+00 : f32
+    %2483 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481 : tensor<1x80x4096xf32>) outs(%2482 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2030 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2031 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2484 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2483 : tensor<1x80x4096xf32>) outs(%cst_2031 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2032 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2485 = tensor.empty() : tensor<1x80x1xf32>
+    %2486 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2484, %cst_2032 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2485 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2487 = tensor.empty() : tensor<1x80x1xf32>
+    %2488 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2486 : tensor<1x80x1xf32>) outs(%2487 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2489 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2033 = tensor.collapse_shape %2488 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2490 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481, %collapsed_2033 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2489 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2034 = tensor.expand_shape %extracted_slice_42 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2491 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2035 = tensor.collapse_shape %expanded_2034 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2492 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2035, %2490 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2491 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2493 = tensor.empty() : tensor<4096x11008xf32>
+    %2494 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_368 : tensor<11008x4096xf32>) outs(%2493 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2036 = tensor.collapse_shape %2492 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2037 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2495 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2036, %2494 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2037 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2038 = tensor.expand_shape %2495 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2496 = tensor.empty() : tensor<1x80x11008xf32>
+    %2497 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2038 : tensor<1x80x11008xf32>) outs(%2496 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2498 = tensor.empty() : tensor<4096x11008xf32>
+    %2499 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_370 : tensor<11008x4096xf32>) outs(%2498 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2039 = tensor.collapse_shape %2492 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2040 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2500 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2039, %2499 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2040 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2041 = tensor.expand_shape %2500 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2501 = tensor.empty() : tensor<1x80x11008xf32>
+    %2502 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2497, %expanded_2041 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2501 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2503 = tensor.empty() : tensor<11008x4096xf32>
+    %2504 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_372 : tensor<4096x11008xf32>) outs(%2503 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2042 = tensor.collapse_shape %2502 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2043 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2505 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2042, %2504 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2043 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2044 = tensor.expand_shape %2505 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2506 = tensor.empty() : tensor<1x80x4096xf32>
+    %2507 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481, %expanded_2044 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2506 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2508 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2045 = arith.constant 2.000000e+00 : f32
+    %2509 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507 : tensor<1x80x4096xf32>) outs(%2508 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2045 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2046 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2510 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2509 : tensor<1x80x4096xf32>) outs(%cst_2046 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2047 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2511 = tensor.empty() : tensor<1x80x1xf32>
+    %2512 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2510, %cst_2047 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2511 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2513 = tensor.empty() : tensor<1x80x1xf32>
+    %2514 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2512 : tensor<1x80x1xf32>) outs(%2513 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2515 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2048 = tensor.collapse_shape %2514 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2516 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507, %collapsed_2048 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2515 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2049 = tensor.expand_shape %extracted_slice_43 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2517 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2050 = tensor.collapse_shape %expanded_2049 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2518 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2050, %2516 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2517 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2519 = tensor.empty() : tensor<4096x4096xf32>
+    %2520 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_374 : tensor<4096x4096xf32>) outs(%2519 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2051 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2052 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2521 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2051, %2520 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2052 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2053 = tensor.expand_shape %2521 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2522 = tensor.empty() : tensor<4096x4096xf32>
+    %2523 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_376 : tensor<4096x4096xf32>) outs(%2522 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2054 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2055 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2524 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2054, %2523 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2055 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2056 = tensor.expand_shape %2524 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2525 = tensor.empty() : tensor<4096x4096xf32>
+    %2526 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_378 : tensor<4096x4096xf32>) outs(%2525 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2057 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2058 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2527 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2057, %2526 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2058 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2059 = tensor.expand_shape %2527 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2060 = tensor.expand_shape %expanded_2053 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2528 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2529 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2060 : tensor<1x80x32x128xf32>) outs(%2528 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2061 = tensor.expand_shape %expanded_2056 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2530 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2531 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2061 : tensor<1x80x32x128xf32>) outs(%2530 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2062 = tensor.expand_shape %expanded_2059 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2532 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2533 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2062 : tensor<1x80x32x128xf32>) outs(%2532 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2063 = tensor.extract_slice %expanded_604[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2064 = tensor.extract_slice %expanded_606[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2534 = tensor.empty() : tensor<1x80x128xf32>
+    %2535 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2063 : tensor<1x1x80x128xf32>) outs(%2534 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2536 = tensor.empty() : tensor<80x128xf32>
+    %2537 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2535 : tensor<1x80x128xf32>) outs(%2536 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2538 = tensor.empty() : tensor<1x80x128xf32>
+    %2539 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2064 : tensor<1x1x80x128xf32>) outs(%2538 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2540 = tensor.empty() : tensor<80x128xf32>
+    %2541 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2539 : tensor<1x80x128xf32>) outs(%2540 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2542 = tensor.empty() : tensor<1x80x128xf32>
+    %2543 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2542 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2537[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2065 = tensor.expand_shape %2543 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2544 = tensor.empty() : tensor<1x80x128xf32>
+    %2545 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2544 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2541[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2066 = tensor.expand_shape %2545 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2546 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2547 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2529, %2543 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2546 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2067 = tensor.extract_slice %2529[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2068 = tensor.extract_slice %2529[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2548 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2549 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2068 : tensor<1x32x80x64xf32>) outs(%2548 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2550 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2069 = tensor.insert_slice %2549 into %2550[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2070 = tensor.insert_slice %extracted_slice_2067 into %inserted_slice_2069[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2551 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2552 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2070, %2545 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2551 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2553 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2554 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2547, %2552 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2553 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2555 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2556 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2531, %2543 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2555 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2071 = tensor.extract_slice %2531[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2072 = tensor.extract_slice %2531[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2557 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2558 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2072 : tensor<1x32x80x64xf32>) outs(%2557 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2559 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2073 = tensor.insert_slice %2558 into %2559[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2074 = tensor.insert_slice %extracted_slice_2071 into %inserted_slice_2073[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2560 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2561 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2074, %2545 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2560 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2562 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2563 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2556, %2561 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2562 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2564 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2565 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2563 : tensor<1x32x80x128xf32>) outs(%2564 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2075 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2076 = tensor.collapse_shape %2554 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2077 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2078 = tensor.collapse_shape %2565 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2079 = arith.constant 0.000000e+00 : f32
+    %2566 = tensor.empty() : tensor<32x80x80xf32>
+    %2567 = linalg.fill ins(%cst_2079 : f32) outs(%2566 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2568 = linalg.batch_matmul ins(%collapsed_2076, %collapsed_2078 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2567 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2080 = tensor.expand_shape %2568 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2081 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2569 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2570 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2081 : tensor<1x32x80x80xf32>) outs(%2569 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2571 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2572 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2080, %2570 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2571 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2573 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2082 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2574 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2572, %collapsed_2082 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2573 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2575 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2576 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2575 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2577 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2574 : tensor<1x32x80x80xf32>) outs(%2576 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32  // running max; accumulator %out is seeded from %2576 (0xFF800000 = -inf fill), not the raw tensor.empty
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2578 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2579 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2574, %2577 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2578 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2580 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2581 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2580 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2582 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2579 : tensor<1x32x80x80xf32>) outs(%2581 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2583 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2584 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2579, %2582 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2583 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2083 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2084 = tensor.collapse_shape %2584 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2085 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2086 = tensor.collapse_shape %2533 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2087 = arith.constant 0.000000e+00 : f32
+    %2585 = tensor.empty() : tensor<32x80x128xf32>
+    %2586 = linalg.fill ins(%cst_2087 : f32) outs(%2585 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2587 = linalg.batch_matmul ins(%collapsed_2084, %collapsed_2086 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2586 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2088 = tensor.expand_shape %2587 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2588 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2589 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2088 : tensor<1x32x80x128xf32>) outs(%2588 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2089 = tensor.collapse_shape %2589 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2590 = tensor.empty() : tensor<4096x4096xf32>
+    %2591 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_380 : tensor<4096x4096xf32>) outs(%2590 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2090 = tensor.collapse_shape %collapsed_2089 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2091 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2592 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2090, %2591 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2091 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2092 = tensor.expand_shape %2592 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2593 = tensor.empty() : tensor<1x80x4096xf32>
+    %2594 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507, %expanded_2092 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2593 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2595 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2093 = arith.constant 2.000000e+00 : f32
+    %2596 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594 : tensor<1x80x4096xf32>) outs(%2595 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2093 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2094 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2597 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2596 : tensor<1x80x4096xf32>) outs(%cst_2094 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2095 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2598 = tensor.empty() : tensor<1x80x1xf32>
+    %2599 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2597, %cst_2095 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2598 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2600 = tensor.empty() : tensor<1x80x1xf32>
+    %2601 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2599 : tensor<1x80x1xf32>) outs(%2600 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2602 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2096 = tensor.collapse_shape %2601 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2603 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594, %collapsed_2096 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2602 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2097 = tensor.expand_shape %extracted_slice_44 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2604 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2098 = tensor.collapse_shape %expanded_2097 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2605 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2098, %2603 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2604 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2606 = tensor.empty() : tensor<4096x11008xf32>
+    %2607 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_382 : tensor<11008x4096xf32>) outs(%2606 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2099 = tensor.collapse_shape %2605 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2100 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2608 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2099, %2607 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2100 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2101 = tensor.expand_shape %2608 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2609 = tensor.empty() : tensor<1x80x11008xf32>
+    %2610 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2101 : tensor<1x80x11008xf32>) outs(%2609 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2611 = tensor.empty() : tensor<4096x11008xf32>
+    %2612 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_384 : tensor<11008x4096xf32>) outs(%2611 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2102 = tensor.collapse_shape %2605 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2103 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2613 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2102, %2612 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2103 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2104 = tensor.expand_shape %2613 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2614 = tensor.empty() : tensor<1x80x11008xf32>
+    %2615 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2610, %expanded_2104 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2614 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2616 = tensor.empty() : tensor<11008x4096xf32>
+    %2617 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_386 : tensor<4096x11008xf32>) outs(%2616 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2105 = tensor.collapse_shape %2615 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2106 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2618 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2105, %2617 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2106 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2107 = tensor.expand_shape %2618 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2619 = tensor.empty() : tensor<1x80x4096xf32>
+    %2620 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594, %expanded_2107 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2619 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2621 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2108 = arith.constant 2.000000e+00 : f32
+    %2622 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620 : tensor<1x80x4096xf32>) outs(%2621 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2108 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2109 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2623 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2622 : tensor<1x80x4096xf32>) outs(%cst_2109 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2110 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2624 = tensor.empty() : tensor<1x80x1xf32>
+    %2625 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2623, %cst_2110 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2624 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2626 = tensor.empty() : tensor<1x80x1xf32>
+    %2627 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2625 : tensor<1x80x1xf32>) outs(%2626 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2628 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2111 = tensor.collapse_shape %2627 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2629 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620, %collapsed_2111 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2628 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2112 = tensor.expand_shape %extracted_slice_45 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2630 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2113 = tensor.collapse_shape %expanded_2112 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2631 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2113, %2629 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2630 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2632 = tensor.empty() : tensor<4096x4096xf32>
+    %2633 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_388 : tensor<4096x4096xf32>) outs(%2632 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2114 = tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2115 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2634 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2114, %2633 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2115 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2116 = tensor.expand_shape %2634 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2635 = tensor.empty() : tensor<4096x4096xf32>
+    %2636 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_390 : tensor<4096x4096xf32>) outs(%2635 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2117 = tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2118 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2637 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2117, %2636 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2118 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2119 = tensor.expand_shape %2637 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2638 = tensor.empty() : tensor<4096x4096xf32>
+    %2639 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_392 : tensor<4096x4096xf32>) outs(%2638 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2120 = tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2121 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2640 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2120, %2639 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2121 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2122 = tensor.expand_shape %2640 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2123 = tensor.expand_shape %expanded_2116 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2641 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2642 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2123 : tensor<1x80x32x128xf32>) outs(%2641 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2124 = tensor.expand_shape %expanded_2119 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2643 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2644 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2124 : tensor<1x80x32x128xf32>) outs(%2643 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2125 = tensor.expand_shape %expanded_2122 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2645 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2646 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2125 : tensor<1x80x32x128xf32>) outs(%2645 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2126 = tensor.extract_slice %expanded_608[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2127 = tensor.extract_slice %expanded_610[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2647 = tensor.empty() : tensor<1x80x128xf32>
+    %2648 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2126 : tensor<1x1x80x128xf32>) outs(%2647 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2649 = tensor.empty() : tensor<80x128xf32>
+    %2650 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2648 : tensor<1x80x128xf32>) outs(%2649 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2651 = tensor.empty() : tensor<1x80x128xf32>
+    %2652 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2127 : tensor<1x1x80x128xf32>) outs(%2651 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2653 = tensor.empty() : tensor<80x128xf32>
+    %2654 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2652 : tensor<1x80x128xf32>) outs(%2653 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2655 = tensor.empty() : tensor<1x80x128xf32>
+    %2656 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2655 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2650[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2128 = tensor.expand_shape %2656 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2657 = tensor.empty() : tensor<1x80x128xf32>
+    %2658 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2657 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2654[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2129 = tensor.expand_shape %2658 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2659 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2660 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2642, %2656 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2659 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2130 = tensor.extract_slice %2642[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2131 = tensor.extract_slice %2642[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2661 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2662 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2131 : tensor<1x32x80x64xf32>) outs(%2661 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2663 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2132 = tensor.insert_slice %2662 into %2663[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2133 = tensor.insert_slice %extracted_slice_2130 into %inserted_slice_2132[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2664 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2665 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2133, %2658 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2664 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2666 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2667 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2660, %2665 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2666 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2668 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2669 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2644, %2656 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2668 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2134 = tensor.extract_slice %2644[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2135 = tensor.extract_slice %2644[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2670 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2671 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2135 : tensor<1x32x80x64xf32>) outs(%2670 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2672 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2136 = tensor.insert_slice %2671 into %2672[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2137 = tensor.insert_slice %extracted_slice_2134 into %inserted_slice_2136[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2673 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2674 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2137, %2658 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2673 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2675 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2676 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2669, %2674 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2675 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2677 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2678 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2676 : tensor<1x32x80x128xf32>) outs(%2677 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2138 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2139 = tensor.collapse_shape %2667 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2140 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2141 = tensor.collapse_shape %2678 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2142 = arith.constant 0.000000e+00 : f32
+    %2679 = tensor.empty() : tensor<32x80x80xf32>
+    %2680 = linalg.fill ins(%cst_2142 : f32) outs(%2679 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2681 = linalg.batch_matmul ins(%collapsed_2139, %collapsed_2141 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2680 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2143 = tensor.expand_shape %2681 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2144 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2682 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2683 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2144 : tensor<1x32x80x80xf32>) outs(%2682 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2684 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2685 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2143, %2683 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2684 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2686 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2145 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2687 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2685, %collapsed_2145 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2686 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2688 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2689 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2688 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2690 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2687 : tensor<1x32x80x80xf32>) outs(%2689 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32 // max-reduce over the last dim; %out is seeded from the -inf-filled %2689, not the bare tensor.empty %2688
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2691 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2692 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2687, %2690 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2691 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2693 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2694 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2693 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2695 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2692 : tensor<1x32x80x80xf32>) outs(%2694 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2696 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2697 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2692, %2695 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2696 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2146 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2147 = tensor.collapse_shape %2697 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2148 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2149 = tensor.collapse_shape %2646 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2150 = arith.constant 0.000000e+00 : f32
+    %2698 = tensor.empty() : tensor<32x80x128xf32>
+    %2699 = linalg.fill ins(%cst_2150 : f32) outs(%2698 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2700 = linalg.batch_matmul ins(%collapsed_2147, %collapsed_2149 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2699 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2151 = tensor.expand_shape %2700 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2701 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2702 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2151 : tensor<1x32x80x128xf32>) outs(%2701 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2152 = tensor.collapse_shape %2702 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2703 = tensor.empty() : tensor<4096x4096xf32>
+    %2704 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_394 : tensor<4096x4096xf32>) outs(%2703 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2153 = tensor.collapse_shape %collapsed_2152 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2154 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2705 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2153, %2704 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2154 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2155 = tensor.expand_shape %2705 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2706 = tensor.empty() : tensor<1x80x4096xf32>
+    %2707 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620, %expanded_2155 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2706 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2708 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2156 = arith.constant 2.000000e+00 : f32
+    %2709 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707 : tensor<1x80x4096xf32>) outs(%2708 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2156 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2157 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2710 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2709 : tensor<1x80x4096xf32>) outs(%cst_2157 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2158 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2711 = tensor.empty() : tensor<1x80x1xf32>
+    %2712 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2710, %cst_2158 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2711 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2713 = tensor.empty() : tensor<1x80x1xf32>
+    %2714 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2712 : tensor<1x80x1xf32>) outs(%2713 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2715 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2159 = tensor.collapse_shape %2714 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2716 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707, %collapsed_2159 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2715 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2160 = tensor.expand_shape %extracted_slice_46 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2717 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2161 = tensor.collapse_shape %expanded_2160 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2718 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2161, %2716 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2717 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2719 = tensor.empty() : tensor<4096x11008xf32>
+    %2720 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_396 : tensor<11008x4096xf32>) outs(%2719 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2162 = tensor.collapse_shape %2718 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2163 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2721 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2162, %2720 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2163 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2164 = tensor.expand_shape %2721 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2722 = tensor.empty() : tensor<1x80x11008xf32>
+    %2723 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2164 : tensor<1x80x11008xf32>) outs(%2722 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2724 = tensor.empty() : tensor<4096x11008xf32>
+    %2725 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_398 : tensor<11008x4096xf32>) outs(%2724 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2165 = tensor.collapse_shape %2718 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2166 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2726 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2165, %2725 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2166 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2167 = tensor.expand_shape %2726 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2727 = tensor.empty() : tensor<1x80x11008xf32>
+    %2728 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2723, %expanded_2167 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2727 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2729 = tensor.empty() : tensor<11008x4096xf32>
+    %2730 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_400 : tensor<4096x11008xf32>) outs(%2729 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2168 = tensor.collapse_shape %2728 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2169 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2731 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2168, %2730 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2169 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2170 = tensor.expand_shape %2731 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2732 = tensor.empty() : tensor<1x80x4096xf32>
+    %2733 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707, %expanded_2170 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2732 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2734 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2171 = arith.constant 2.000000e+00 : f32
+    %2735 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733 : tensor<1x80x4096xf32>) outs(%2734 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2171 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2172 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2736 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2735 : tensor<1x80x4096xf32>) outs(%cst_2172 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2173 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2737 = tensor.empty() : tensor<1x80x1xf32>
+    %2738 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2736, %cst_2173 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2737 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2739 = tensor.empty() : tensor<1x80x1xf32>
+    %2740 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2738 : tensor<1x80x1xf32>) outs(%2739 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2741 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2174 = tensor.collapse_shape %2740 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2742 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733, %collapsed_2174 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2741 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2175 = tensor.expand_shape %extracted_slice_47 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2743 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2176 = tensor.collapse_shape %expanded_2175 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2744 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2176, %2742 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2743 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2745 = tensor.empty() : tensor<4096x4096xf32>
+    %2746 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_402 : tensor<4096x4096xf32>) outs(%2745 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2177 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2178 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2747 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2177, %2746 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2178 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2179 = tensor.expand_shape %2747 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2748 = tensor.empty() : tensor<4096x4096xf32>
+    %2749 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_404 : tensor<4096x4096xf32>) outs(%2748 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2180 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2181 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2750 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2180, %2749 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2181 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2182 = tensor.expand_shape %2750 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2751 = tensor.empty() : tensor<4096x4096xf32>
+    %2752 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_406 : tensor<4096x4096xf32>) outs(%2751 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2183 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2184 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2753 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2183, %2752 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2184 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2185 = tensor.expand_shape %2753 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2186 = tensor.expand_shape %expanded_2179 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2754 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2755 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2186 : tensor<1x80x32x128xf32>) outs(%2754 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2187 = tensor.expand_shape %expanded_2182 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2756 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2757 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2187 : tensor<1x80x32x128xf32>) outs(%2756 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2188 = tensor.expand_shape %expanded_2185 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2758 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2759 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2188 : tensor<1x80x32x128xf32>) outs(%2758 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2189 = tensor.extract_slice %expanded_612[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2190 = tensor.extract_slice %expanded_614[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2760 = tensor.empty() : tensor<1x80x128xf32>
+    %2761 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2189 : tensor<1x1x80x128xf32>) outs(%2760 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2762 = tensor.empty() : tensor<80x128xf32>
+    %2763 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2761 : tensor<1x80x128xf32>) outs(%2762 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2764 = tensor.empty() : tensor<1x80x128xf32>
+    %2765 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2190 : tensor<1x1x80x128xf32>) outs(%2764 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2766 = tensor.empty() : tensor<80x128xf32>
+    %2767 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2765 : tensor<1x80x128xf32>) outs(%2766 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2768 = tensor.empty() : tensor<1x80x128xf32>
+    %2769 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2768 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2763[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2191 = tensor.expand_shape %2769 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2770 = tensor.empty() : tensor<1x80x128xf32>
+    %2771 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2770 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2767[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2192 = tensor.expand_shape %2771 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2772 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2773 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2755, %2769 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2772 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2193 = tensor.extract_slice %2755[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2194 = tensor.extract_slice %2755[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2774 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2775 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2194 : tensor<1x32x80x64xf32>) outs(%2774 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2776 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2195 = tensor.insert_slice %2775 into %2776[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2196 = tensor.insert_slice %extracted_slice_2193 into %inserted_slice_2195[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2777 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2778 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2196, %2771 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2777 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2779 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2780 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2773, %2778 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2779 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2781 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2782 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2757, %2769 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2781 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2197 = tensor.extract_slice %2757[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2198 = tensor.extract_slice %2757[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2783 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2784 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2198 : tensor<1x32x80x64xf32>) outs(%2783 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2785 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2199 = tensor.insert_slice %2784 into %2785[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2200 = tensor.insert_slice %extracted_slice_2197 into %inserted_slice_2199[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2786 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2787 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2200, %2771 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2786 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2788 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2789 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2782, %2787 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2788 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2790 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2791 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2789 : tensor<1x32x80x128xf32>) outs(%2790 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2201 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2202 = tensor.collapse_shape %2780 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2203 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2204 = tensor.collapse_shape %2791 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2205 = arith.constant 0.000000e+00 : f32
+    %2792 = tensor.empty() : tensor<32x80x80xf32>
+    %2793 = linalg.fill ins(%cst_2205 : f32) outs(%2792 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2794 = linalg.batch_matmul ins(%collapsed_2202, %collapsed_2204 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2793 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2206 = tensor.expand_shape %2794 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2207 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2795 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2796 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2207 : tensor<1x32x80x80xf32>) outs(%2795 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2797 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2798 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2206, %2796 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2797 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2799 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2208 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2800 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2798, %collapsed_2208 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2799 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2801 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2802 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2801 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32 // f32 bit pattern of -inf: identity element for the max reduction below
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2803 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2800 : tensor<1x32x80x80xf32>) outs(%2802 : tensor<1x32x80x1xf32>) { // accumulate into the -inf-initialized %2802, not the uninitialized tensor.empty %2801
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2804 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2805 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2800, %2803 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2804 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2806 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2807 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2806 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2808 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2805 : tensor<1x32x80x80xf32>) outs(%2807 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2809 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2810 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2805, %2808 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2809 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2209 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2210 = tensor.collapse_shape %2810 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2211 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2212 = tensor.collapse_shape %2759 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2213 = arith.constant 0.000000e+00 : f32
+    %2811 = tensor.empty() : tensor<32x80x128xf32>
+    %2812 = linalg.fill ins(%cst_2213 : f32) outs(%2811 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2813 = linalg.batch_matmul ins(%collapsed_2210, %collapsed_2212 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2812 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2214 = tensor.expand_shape %2813 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2814 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2815 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2214 : tensor<1x32x80x128xf32>) outs(%2814 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2215 = tensor.collapse_shape %2815 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2816 = tensor.empty() : tensor<4096x4096xf32>
+    %2817 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_408 : tensor<4096x4096xf32>) outs(%2816 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2216 = tensor.collapse_shape %collapsed_2215 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2217 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2818 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2216, %2817 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2217 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2218 = tensor.expand_shape %2818 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2819 = tensor.empty() : tensor<1x80x4096xf32>
+    %2820 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733, %expanded_2218 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2819 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2821 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2219 = arith.constant 2.000000e+00 : f32
+    %2822 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820 : tensor<1x80x4096xf32>) outs(%2821 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2219 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2220 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2823 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2822 : tensor<1x80x4096xf32>) outs(%cst_2220 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2221 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2824 = tensor.empty() : tensor<1x80x1xf32>
+    %2825 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2823, %cst_2221 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2824 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2826 = tensor.empty() : tensor<1x80x1xf32>
+    %2827 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2825 : tensor<1x80x1xf32>) outs(%2826 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2828 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2222 = tensor.collapse_shape %2827 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2829 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820, %collapsed_2222 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2828 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2223 = tensor.expand_shape %extracted_slice_48 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2830 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2224 = tensor.collapse_shape %expanded_2223 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2831 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2224, %2829 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2830 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2832 = tensor.empty() : tensor<4096x11008xf32>
+    %2833 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_410 : tensor<11008x4096xf32>) outs(%2832 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2225 = tensor.collapse_shape %2831 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2226 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2834 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2225, %2833 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2226 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2227 = tensor.expand_shape %2834 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2835 = tensor.empty() : tensor<1x80x11008xf32>
+    %2836 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2227 : tensor<1x80x11008xf32>) outs(%2835 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2837 = tensor.empty() : tensor<4096x11008xf32>
+    %2838 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_412 : tensor<11008x4096xf32>) outs(%2837 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2228 = tensor.collapse_shape %2831 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2229 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2839 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2228, %2838 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2229 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2230 = tensor.expand_shape %2839 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2840 = tensor.empty() : tensor<1x80x11008xf32>
+    %2841 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2836, %expanded_2230 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2840 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2842 = tensor.empty() : tensor<11008x4096xf32>
+    %2843 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_414 : tensor<4096x11008xf32>) outs(%2842 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2231 = tensor.collapse_shape %2841 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2232 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2844 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2231, %2843 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2232 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2233 = tensor.expand_shape %2844 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2845 = tensor.empty() : tensor<1x80x4096xf32>
+    %2846 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820, %expanded_2233 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2845 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2847 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2234 = arith.constant 2.000000e+00 : f32
+    %2848 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846 : tensor<1x80x4096xf32>) outs(%2847 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2234 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2235 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2849 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2848 : tensor<1x80x4096xf32>) outs(%cst_2235 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2236 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2850 = tensor.empty() : tensor<1x80x1xf32>
+    %2851 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2849, %cst_2236 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2850 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2852 = tensor.empty() : tensor<1x80x1xf32>
+    %2853 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2851 : tensor<1x80x1xf32>) outs(%2852 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2854 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2237 = tensor.collapse_shape %2853 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2855 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846, %collapsed_2237 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2854 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2238 = tensor.expand_shape %extracted_slice_49 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2856 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2239 = tensor.collapse_shape %expanded_2238 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2857 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2239, %2855 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2856 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2858 = tensor.empty() : tensor<4096x4096xf32>
+    %2859 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_416 : tensor<4096x4096xf32>) outs(%2858 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2240 = tensor.collapse_shape %2857 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2241 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2860 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2240, %2859 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2241 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2242 = tensor.expand_shape %2860 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2861 = tensor.empty() : tensor<4096x4096xf32>
+    %2862 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_418 : tensor<4096x4096xf32>) outs(%2861 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2243 = tensor.collapse_shape %2857 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2244 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2863 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2243, %2862 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2244 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2245 = tensor.expand_shape %2863 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2864 = tensor.empty() : tensor<4096x4096xf32>
+    %2865 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_420 : tensor<4096x4096xf32>) outs(%2864 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2246 = tensor.collapse_shape %2857 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2247 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2866 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2246, %2865 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2247 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2248 = tensor.expand_shape %2866 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2249 = tensor.expand_shape %expanded_2242 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2867 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2868 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2249 : tensor<1x80x32x128xf32>) outs(%2867 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2250 = tensor.expand_shape %expanded_2245 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2869 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2870 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2250 : tensor<1x80x32x128xf32>) outs(%2869 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2251 = tensor.expand_shape %expanded_2248 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2871 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2872 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2251 : tensor<1x80x32x128xf32>) outs(%2871 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2252 = tensor.extract_slice %expanded_616[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2253 = tensor.extract_slice %expanded_618[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2873 = tensor.empty() : tensor<1x80x128xf32>
+    %2874 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2252 : tensor<1x1x80x128xf32>) outs(%2873 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2875 = tensor.empty() : tensor<80x128xf32>
+    %2876 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2874 : tensor<1x80x128xf32>) outs(%2875 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2877 = tensor.empty() : tensor<1x80x128xf32>
+    %2878 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2253 : tensor<1x1x80x128xf32>) outs(%2877 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2879 = tensor.empty() : tensor<80x128xf32>
+    %2880 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2878 : tensor<1x80x128xf32>) outs(%2879 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2881 = tensor.empty() : tensor<1x80x128xf32>
+    %2882 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2881 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2876[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2254 = tensor.expand_shape %2882 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2883 = tensor.empty() : tensor<1x80x128xf32>
+    %2884 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2883 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2880[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2255 = tensor.expand_shape %2884 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2885 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2886 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2868, %2882 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2885 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2256 = tensor.extract_slice %2868[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2257 = tensor.extract_slice %2868[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2887 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2888 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2257 : tensor<1x32x80x64xf32>) outs(%2887 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2889 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2258 = tensor.insert_slice %2888 into %2889[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2259 = tensor.insert_slice %extracted_slice_2256 into %inserted_slice_2258[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2890 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2891 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2259, %2884 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2890 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2892 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2893 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2886, %2891 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2892 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2894 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2895 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2870, %2882 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2894 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2260 = tensor.extract_slice %2870[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2261 = tensor.extract_slice %2870[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %2896 = tensor.empty() : tensor<1x32x80x64xf32>
+    %2897 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2261 : tensor<1x32x80x64xf32>) outs(%2896 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %2898 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2262 = tensor.insert_slice %2897 into %2898[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2263 = tensor.insert_slice %extracted_slice_2260 into %inserted_slice_2262[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %2899 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2900 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2263, %2884 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2899 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2901 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2902 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2895, %2900 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2901 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %2903 = tensor.empty() : tensor<1x32x128x80xf32>
+    %2904 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2902 : tensor<1x32x80x128xf32>) outs(%2903 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2264 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2265 = tensor.collapse_shape %2893 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2266 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2267 = tensor.collapse_shape %2904 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2268 = arith.constant 0.000000e+00 : f32
+    %2905 = tensor.empty() : tensor<32x80x80xf32>
+    %2906 = linalg.fill ins(%cst_2268 : f32) outs(%2905 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %2907 = linalg.batch_matmul ins(%collapsed_2265, %collapsed_2267 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2906 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2269 = tensor.expand_shape %2907 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2270 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %2908 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2909 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2270 : tensor<1x32x80x80xf32>) outs(%2908 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2910 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2911 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2269, %2909 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2910 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2912 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2271 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %2913 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2911, %collapsed_2271 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2912 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2914 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2915 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2914 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):  // Fill the max accumulator with -inf (0xFF800000), the identity element for maximumf.
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2916 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2913 : tensor<1x32x80x80xf32>) outs(%2915 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):  // Max-reduce along the "reduction" iterator; the accumulator starts from the -inf fill %2915 rather than the uninitialized tensor.empty %2914.
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2917 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2918 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2913, %2916 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2917 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %2919 = tensor.empty() : tensor<1x32x80x1xf32>
+    %2920 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2919 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2921 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2918 : tensor<1x32x80x80xf32>) outs(%2920 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %2922 = tensor.empty() : tensor<1x32x80x80xf32>
+    %2923 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2918, %2921 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2922 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2272 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2273 = tensor.collapse_shape %2923 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2274 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2275 = tensor.collapse_shape %2872 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2276 = arith.constant 0.000000e+00 : f32
+    %2924 = tensor.empty() : tensor<32x80x128xf32>
+    %2925 = linalg.fill ins(%cst_2276 : f32) outs(%2924 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %2926 = linalg.batch_matmul ins(%collapsed_2273, %collapsed_2275 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2925 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2277 = tensor.expand_shape %2926 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %2927 = tensor.empty() : tensor<1x80x32x128xf32>
+    %2928 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2277 : tensor<1x32x80x128xf32>) outs(%2927 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2278 = tensor.collapse_shape %2928 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %2929 = tensor.empty() : tensor<4096x4096xf32>
+    %2930 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_422 : tensor<4096x4096xf32>) outs(%2929 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2279 = tensor.collapse_shape %collapsed_2278 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2280 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2931 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2279, %2930 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2280 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2281 = tensor.expand_shape %2931 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2932 = tensor.empty() : tensor<1x80x4096xf32>
+    %2933 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846, %expanded_2281 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2932 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2934 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2282 = arith.constant 2.000000e+00 : f32
+    %2935 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933 : tensor<1x80x4096xf32>) outs(%2934 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2282 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2283 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2936 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2935 : tensor<1x80x4096xf32>) outs(%cst_2283 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2284 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2937 = tensor.empty() : tensor<1x80x1xf32>
+    %2938 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2936, %cst_2284 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2937 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2939 = tensor.empty() : tensor<1x80x1xf32>
+    %2940 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2938 : tensor<1x80x1xf32>) outs(%2939 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2941 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2285 = tensor.collapse_shape %2940 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2942 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933, %collapsed_2285 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2941 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2286 = tensor.expand_shape %extracted_slice_50 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2943 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2287 = tensor.collapse_shape %expanded_2286 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2944 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2287, %2942 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2943 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2945 = tensor.empty() : tensor<4096x11008xf32>
+    %2946 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_424 : tensor<11008x4096xf32>) outs(%2945 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2288 = tensor.collapse_shape %2944 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2289 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2947 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2288, %2946 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2289 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2290 = tensor.expand_shape %2947 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2948 = tensor.empty() : tensor<1x80x11008xf32>
+    %2949 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2290 : tensor<1x80x11008xf32>) outs(%2948 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %2950 = tensor.empty() : tensor<4096x11008xf32>
+    %2951 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_426 : tensor<11008x4096xf32>) outs(%2950 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2291 = tensor.collapse_shape %2944 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2292 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %2952 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2291, %2951 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2292 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2293 = tensor.expand_shape %2952 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %2953 = tensor.empty() : tensor<1x80x11008xf32>
+    %2954 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2949, %expanded_2293 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2953 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %2955 = tensor.empty() : tensor<11008x4096xf32>
+    %2956 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_428 : tensor<4096x11008xf32>) outs(%2955 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2294 = tensor.collapse_shape %2954 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2295 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2957 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2294, %2956 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2295 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2296 = tensor.expand_shape %2957 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2958 = tensor.empty() : tensor<1x80x4096xf32>
+    %2959 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933, %expanded_2296 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2958 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2960 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2297 = arith.constant 2.000000e+00 : f32
+    %2961 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959 : tensor<1x80x4096xf32>) outs(%2960 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2297 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2298 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %2962 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2961 : tensor<1x80x4096xf32>) outs(%cst_2298 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2299 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %2963 = tensor.empty() : tensor<1x80x1xf32>
+    %2964 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2962, %cst_2299 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2963 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2965 = tensor.empty() : tensor<1x80x1xf32>
+    %2966 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2964 : tensor<1x80x1xf32>) outs(%2965 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %2967 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2300 = tensor.collapse_shape %2966 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %2968 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959, %collapsed_2300 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2967 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2301 = tensor.expand_shape %extracted_slice_51 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %2969 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2302 = tensor.collapse_shape %expanded_2301 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %2970 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2302, %2968 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2969 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %2971 = tensor.empty() : tensor<4096x4096xf32>
+    %2972 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_430 : tensor<4096x4096xf32>) outs(%2971 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2303 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2304 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2973 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2303, %2972 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2304 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2305 = tensor.expand_shape %2973 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2974 = tensor.empty() : tensor<4096x4096xf32>
+    %2975 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_432 : tensor<4096x4096xf32>) outs(%2974 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2306 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2307 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2976 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2306, %2975 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2307 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2308 = tensor.expand_shape %2976 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %2977 = tensor.empty() : tensor<4096x4096xf32>
+    %2978 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_434 : tensor<4096x4096xf32>) outs(%2977 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2309 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2310 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %2979 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2309, %2978 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2310 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2311 = tensor.expand_shape %2979 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2312 = tensor.expand_shape %expanded_2305 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2980 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2981 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2312 : tensor<1x80x32x128xf32>) outs(%2980 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2313 = tensor.expand_shape %expanded_2308 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2982 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2983 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2313 : tensor<1x80x32x128xf32>) outs(%2982 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2314 = tensor.expand_shape %expanded_2311 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %2984 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2985 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2314 : tensor<1x80x32x128xf32>) outs(%2984 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2315 = tensor.extract_slice %expanded_620[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2316 = tensor.extract_slice %expanded_622[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %2986 = tensor.empty() : tensor<1x80x128xf32>
+    %2987 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2315 : tensor<1x1x80x128xf32>) outs(%2986 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2988 = tensor.empty() : tensor<80x128xf32>
+    %2989 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2987 : tensor<1x80x128xf32>) outs(%2988 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2990 = tensor.empty() : tensor<1x80x128xf32>
+    %2991 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2316 : tensor<1x1x80x128xf32>) outs(%2990 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %2992 = tensor.empty() : tensor<80x128xf32>
+    %2993 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2991 : tensor<1x80x128xf32>) outs(%2992 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %2994 = tensor.empty() : tensor<1x80x128xf32>
+    %2995 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2994 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2989[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2317 = tensor.expand_shape %2995 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2996 = tensor.empty() : tensor<1x80x128xf32>
+    %2997 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2996 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %2993[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2318 = tensor.expand_shape %2997 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %2998 = tensor.empty() : tensor<1x32x80x128xf32>
+    %2999 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2981, %2995 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2998 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2319 = tensor.extract_slice %2981[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2320 = tensor.extract_slice %2981[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3000 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3001 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2320 : tensor<1x32x80x64xf32>) outs(%3000 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3002 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2321 = tensor.insert_slice %3001 into %3002[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2322 = tensor.insert_slice %extracted_slice_2319 into %inserted_slice_2321[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3003 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3004 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2322, %2997 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3003 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3005 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3006 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2999, %3004 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3005 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3007 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3008 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2983, %2995 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3007 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2323 = tensor.extract_slice %2983[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2324 = tensor.extract_slice %2983[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3009 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3010 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2324 : tensor<1x32x80x64xf32>) outs(%3009 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3011 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2325 = tensor.insert_slice %3010 into %3011[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2326 = tensor.insert_slice %extracted_slice_2323 into %inserted_slice_2325[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3012 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3013 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2326, %2997 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3012 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3014 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3015 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3008, %3013 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3014 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3016 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3017 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3015 : tensor<1x32x80x128xf32>) outs(%3016 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2327 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2328 = tensor.collapse_shape %3006 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2329 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2330 = tensor.collapse_shape %3017 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2331 = arith.constant 0.000000e+00 : f32
+    %3018 = tensor.empty() : tensor<32x80x80xf32>
+    %3019 = linalg.fill ins(%cst_2331 : f32) outs(%3018 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3020 = linalg.batch_matmul ins(%collapsed_2328, %collapsed_2330 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3019 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2332 = tensor.expand_shape %3020 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2333 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3021 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3022 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2333 : tensor<1x32x80x80xf32>) outs(%3021 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3023 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3024 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2332, %3022 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3023 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3025 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2334 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3026 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3024, %collapsed_2334 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3025 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3027 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3028 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3027 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3029 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3026 : tensor<1x32x80x80xf32>) outs(%3028 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32): // %out is the running max; seeded from the -inf fill %3028, not the undefined tensor.empty %3027
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3030 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3031 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3026, %3029 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3030 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3032 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3033 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3032 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3034 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3031 : tensor<1x32x80x80xf32>) outs(%3033 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3035 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3036 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3031, %3034 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3035 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2335 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2336 = tensor.collapse_shape %3036 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2337 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2338 = tensor.collapse_shape %2985 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2339 = arith.constant 0.000000e+00 : f32
+    %3037 = tensor.empty() : tensor<32x80x128xf32>
+    %3038 = linalg.fill ins(%cst_2339 : f32) outs(%3037 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3039 = linalg.batch_matmul ins(%collapsed_2336, %collapsed_2338 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3038 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2340 = tensor.expand_shape %3039 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3040 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3041 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2340 : tensor<1x32x80x128xf32>) outs(%3040 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2341 = tensor.collapse_shape %3041 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3042 = tensor.empty() : tensor<4096x4096xf32>
+    %3043 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_436 : tensor<4096x4096xf32>) outs(%3042 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2342 = tensor.collapse_shape %collapsed_2341 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2343 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3044 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2342, %3043 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2343 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2344 = tensor.expand_shape %3044 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3045 = tensor.empty() : tensor<1x80x4096xf32>
+    %3046 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959, %expanded_2344 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3045 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3047 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2345 = arith.constant 2.000000e+00 : f32
+    %3048 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3046 : tensor<1x80x4096xf32>) outs(%3047 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2345 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2346 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3049 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3048 : tensor<1x80x4096xf32>) outs(%cst_2346 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2347 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3050 = tensor.empty() : tensor<1x80x1xf32>
+    %3051 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3049, %cst_2347 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3050 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3052 = tensor.empty() : tensor<1x80x1xf32>
+    %3053 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3051 : tensor<1x80x1xf32>) outs(%3052 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3054 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2348 = tensor.collapse_shape %3053 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3055 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3046, %collapsed_2348 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3054 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2349 = tensor.expand_shape %extracted_slice_52 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3056 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2350 = tensor.collapse_shape %expanded_2349 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3057 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2350, %3055 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3056 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3058 = tensor.empty() : tensor<4096x11008xf32>
+    %3059 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_438 : tensor<11008x4096xf32>) outs(%3058 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2351 = tensor.collapse_shape %3057 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2352 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3060 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2351, %3059 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2352 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2353 = tensor.expand_shape %3060 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3061 = tensor.empty() : tensor<1x80x11008xf32>
+    %3062 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2353 : tensor<1x80x11008xf32>) outs(%3061 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3063 = tensor.empty() : tensor<4096x11008xf32>
+    %3064 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_440 : tensor<11008x4096xf32>) outs(%3063 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2354 = tensor.collapse_shape %3057 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2355 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3065 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2354, %3064 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2355 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2356 = tensor.expand_shape %3065 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3066 = tensor.empty() : tensor<1x80x11008xf32>
+    %3067 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3062, %expanded_2356 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3066 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3068 = tensor.empty() : tensor<11008x4096xf32>
+    %3069 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_442 : tensor<4096x11008xf32>) outs(%3068 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2357 = tensor.collapse_shape %3067 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2358 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3070 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2357, %3069 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2358 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2359 = tensor.expand_shape %3070 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3071 = tensor.empty() : tensor<1x80x4096xf32>
+    %3072 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3046, %expanded_2359 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3071 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3073 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2360 = arith.constant 2.000000e+00 : f32
+    %3074 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072 : tensor<1x80x4096xf32>) outs(%3073 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2360 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2361 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3075 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3074 : tensor<1x80x4096xf32>) outs(%cst_2361 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2362 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3076 = tensor.empty() : tensor<1x80x1xf32>
+    %3077 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3075, %cst_2362 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3076 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3078 = tensor.empty() : tensor<1x80x1xf32>
+    %3079 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3077 : tensor<1x80x1xf32>) outs(%3078 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3080 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2363 = tensor.collapse_shape %3079 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3081 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072, %collapsed_2363 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3080 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2364 = tensor.expand_shape %extracted_slice_53 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3082 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2365 = tensor.collapse_shape %expanded_2364 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3083 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2365, %3081 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3082 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3084 = tensor.empty() : tensor<4096x4096xf32>
+    %3085 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_444 : tensor<4096x4096xf32>) outs(%3084 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2366 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2367 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3086 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2366, %3085 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2367 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2368 = tensor.expand_shape %3086 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3087 = tensor.empty() : tensor<4096x4096xf32>
+    %3088 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_446 : tensor<4096x4096xf32>) outs(%3087 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2369 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2370 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3089 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2369, %3088 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2370 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2371 = tensor.expand_shape %3089 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3090 = tensor.empty() : tensor<4096x4096xf32>
+    %3091 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_448 : tensor<4096x4096xf32>) outs(%3090 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2372 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2373 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3092 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2372, %3091 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2373 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2374 = tensor.expand_shape %3092 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2375 = tensor.expand_shape %expanded_2368 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3093 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3094 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2375 : tensor<1x80x32x128xf32>) outs(%3093 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2376 = tensor.expand_shape %expanded_2371 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3095 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3096 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2376 : tensor<1x80x32x128xf32>) outs(%3095 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2377 = tensor.expand_shape %expanded_2374 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3097 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3098 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2377 : tensor<1x80x32x128xf32>) outs(%3097 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2378 = tensor.extract_slice %expanded_624[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2379 = tensor.extract_slice %expanded_626[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %3099 = tensor.empty() : tensor<1x80x128xf32>
+    %3100 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2378 : tensor<1x1x80x128xf32>) outs(%3099 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3101 = tensor.empty() : tensor<80x128xf32>
+    %3102 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3100 : tensor<1x80x128xf32>) outs(%3101 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3103 = tensor.empty() : tensor<1x80x128xf32>
+    %3104 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2379 : tensor<1x1x80x128xf32>) outs(%3103 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3105 = tensor.empty() : tensor<80x128xf32>
+    %3106 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3104 : tensor<1x80x128xf32>) outs(%3105 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3107 = tensor.empty() : tensor<1x80x128xf32>
+    %3108 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3107 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3102[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2380 = tensor.expand_shape %3108 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3109 = tensor.empty() : tensor<1x80x128xf32>
+    %3110 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3109 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3106[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2381 = tensor.expand_shape %3110 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3111 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3112 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3094, %3108 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3111 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2382 = tensor.extract_slice %3094[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2383 = tensor.extract_slice %3094[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3113 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3114 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2383 : tensor<1x32x80x64xf32>) outs(%3113 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3115 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2384 = tensor.insert_slice %3114 into %3115[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2385 = tensor.insert_slice %extracted_slice_2382 into %inserted_slice_2384[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3116 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3117 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2385, %3110 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3116 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3118 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3119 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3112, %3117 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3118 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3120 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3121 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3096, %3108 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3120 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2386 = tensor.extract_slice %3096[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2387 = tensor.extract_slice %3096[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3122 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3123 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2387 : tensor<1x32x80x64xf32>) outs(%3122 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3124 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2388 = tensor.insert_slice %3123 into %3124[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2389 = tensor.insert_slice %extracted_slice_2386 into %inserted_slice_2388[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3125 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3126 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2389, %3110 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3125 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3127 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3128 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3121, %3126 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3127 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3129 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3130 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3128 : tensor<1x32x80x128xf32>) outs(%3129 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2390 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2391 = tensor.collapse_shape %3119 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2392 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2393 = tensor.collapse_shape %3130 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2394 = arith.constant 0.000000e+00 : f32
+    %3131 = tensor.empty() : tensor<32x80x80xf32>
+    %3132 = linalg.fill ins(%cst_2394 : f32) outs(%3131 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3133 = linalg.batch_matmul ins(%collapsed_2391, %collapsed_2393 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3132 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2395 = tensor.expand_shape %3133 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2396 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3134 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3135 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2396 : tensor<1x32x80x80xf32>) outs(%3134 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3136 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3137 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2395, %3135 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3136 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3138 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2397 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3139 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3137, %collapsed_2397 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3138 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3140 = tensor.empty() : tensor<1x32x80x1xf32>  // destination shape only; contents are unspecified until filled
+    %3141 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3140 : tensor<1x32x80x1xf32>) {  // fill with 0xFF800000 (-inf as f32), the identity element for max
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3142 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3139 : tensor<1x32x80x80xf32>) outs(%3141 : tensor<1x32x80x1xf32>) {  // max-reduce %3139 over the last iterator; accumulator is seeded from the -inf fill %3141 (was the unfilled %3140)
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3143 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3144 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3139, %3142 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3143 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3145 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3146 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3145 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3147 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3144 : tensor<1x32x80x80xf32>) outs(%3146 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3148 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3149 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3144, %3147 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3148 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2398 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2399 = tensor.collapse_shape %3149 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2400 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2401 = tensor.collapse_shape %3098 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2402 = arith.constant 0.000000e+00 : f32
+    %3150 = tensor.empty() : tensor<32x80x128xf32>
+    %3151 = linalg.fill ins(%cst_2402 : f32) outs(%3150 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3152 = linalg.batch_matmul ins(%collapsed_2399, %collapsed_2401 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3151 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2403 = tensor.expand_shape %3152 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3153 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3154 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2403 : tensor<1x32x80x128xf32>) outs(%3153 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2404 = tensor.collapse_shape %3154 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3155 = tensor.empty() : tensor<4096x4096xf32>
+    %3156 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_450 : tensor<4096x4096xf32>) outs(%3155 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2405 = tensor.collapse_shape %collapsed_2404 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2406 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3157 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2405, %3156 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2406 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2407 = tensor.expand_shape %3157 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3158 = tensor.empty() : tensor<1x80x4096xf32>
+    %3159 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072, %expanded_2407 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3158 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3160 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2408 = arith.constant 2.000000e+00 : f32
+    %3161 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159 : tensor<1x80x4096xf32>) outs(%3160 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2408 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2409 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3162 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3161 : tensor<1x80x4096xf32>) outs(%cst_2409 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2410 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3163 = tensor.empty() : tensor<1x80x1xf32>
+    %3164 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3162, %cst_2410 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3163 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3165 = tensor.empty() : tensor<1x80x1xf32>
+    %3166 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3164 : tensor<1x80x1xf32>) outs(%3165 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3167 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2411 = tensor.collapse_shape %3166 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3168 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159, %collapsed_2411 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3167 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2412 = tensor.expand_shape %extracted_slice_54 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3169 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2413 = tensor.collapse_shape %expanded_2412 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3170 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2413, %3168 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3169 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3171 = tensor.empty() : tensor<4096x11008xf32>
+    %3172 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_452 : tensor<11008x4096xf32>) outs(%3171 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2414 = tensor.collapse_shape %3170 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2415 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3173 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2414, %3172 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2415 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2416 = tensor.expand_shape %3173 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3174 = tensor.empty() : tensor<1x80x11008xf32>
+    %3175 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2416 : tensor<1x80x11008xf32>) outs(%3174 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3176 = tensor.empty() : tensor<4096x11008xf32>
+    %3177 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_454 : tensor<11008x4096xf32>) outs(%3176 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2417 = tensor.collapse_shape %3170 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2418 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3178 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2417, %3177 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2418 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2419 = tensor.expand_shape %3178 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3179 = tensor.empty() : tensor<1x80x11008xf32>
+    %3180 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3175, %expanded_2419 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3179 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3181 = tensor.empty() : tensor<11008x4096xf32>
+    %3182 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_456 : tensor<4096x11008xf32>) outs(%3181 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2420 = tensor.collapse_shape %3180 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2421 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3183 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2420, %3182 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2421 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2422 = tensor.expand_shape %3183 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3184 = tensor.empty() : tensor<1x80x4096xf32>
+    %3185 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159, %expanded_2422 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3184 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3186 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2423 = arith.constant 2.000000e+00 : f32
+    %3187 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185 : tensor<1x80x4096xf32>) outs(%3186 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2423 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2424 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3188 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3187 : tensor<1x80x4096xf32>) outs(%cst_2424 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2425 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3189 = tensor.empty() : tensor<1x80x1xf32>
+    %3190 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3188, %cst_2425 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3189 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3191 = tensor.empty() : tensor<1x80x1xf32>
+    %3192 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3190 : tensor<1x80x1xf32>) outs(%3191 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3193 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2426 = tensor.collapse_shape %3192 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3194 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185, %collapsed_2426 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3193 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2427 = tensor.expand_shape %extracted_slice_55 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3195 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2428 = tensor.collapse_shape %expanded_2427 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3196 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2428, %3194 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3195 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3197 = tensor.empty() : tensor<4096x4096xf32>
+    %3198 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_458 : tensor<4096x4096xf32>) outs(%3197 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2429 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2430 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3199 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2429, %3198 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2430 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2431 = tensor.expand_shape %3199 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3200 = tensor.empty() : tensor<4096x4096xf32>
+    %3201 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_460 : tensor<4096x4096xf32>) outs(%3200 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2432 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2433 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3202 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2432, %3201 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2433 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2434 = tensor.expand_shape %3202 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3203 = tensor.empty() : tensor<4096x4096xf32>
+    %3204 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_462 : tensor<4096x4096xf32>) outs(%3203 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2435 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2436 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3205 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2435, %3204 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2436 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2437 = tensor.expand_shape %3205 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2438 = tensor.expand_shape %expanded_2431 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3206 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3207 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2438 : tensor<1x80x32x128xf32>) outs(%3206 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2439 = tensor.expand_shape %expanded_2434 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3208 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3209 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2439 : tensor<1x80x32x128xf32>) outs(%3208 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2440 = tensor.expand_shape %expanded_2437 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3210 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3211 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2440 : tensor<1x80x32x128xf32>) outs(%3210 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2441 = tensor.extract_slice %expanded_628[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2442 = tensor.extract_slice %expanded_630[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %3212 = tensor.empty() : tensor<1x80x128xf32>
+    %3213 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2441 : tensor<1x1x80x128xf32>) outs(%3212 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3214 = tensor.empty() : tensor<80x128xf32>
+    %3215 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3213 : tensor<1x80x128xf32>) outs(%3214 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3216 = tensor.empty() : tensor<1x80x128xf32>
+    %3217 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2442 : tensor<1x1x80x128xf32>) outs(%3216 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3218 = tensor.empty() : tensor<80x128xf32>
+    %3219 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3217 : tensor<1x80x128xf32>) outs(%3218 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3220 = tensor.empty() : tensor<1x80x128xf32>
+    %3221 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3220 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3215[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2443 = tensor.expand_shape %3221 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3222 = tensor.empty() : tensor<1x80x128xf32>
+    %3223 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3222 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3219[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2444 = tensor.expand_shape %3223 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3224 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3225 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3207, %3221 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3224 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2445 = tensor.extract_slice %3207[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2446 = tensor.extract_slice %3207[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3226 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3227 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2446 : tensor<1x32x80x64xf32>) outs(%3226 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3228 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2447 = tensor.insert_slice %3227 into %3228[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2448 = tensor.insert_slice %extracted_slice_2445 into %inserted_slice_2447[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3229 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3230 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2448, %3223 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3229 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3231 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3232 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3225, %3230 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3231 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3233 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3234 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3209, %3221 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3233 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2449 = tensor.extract_slice %3209[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2450 = tensor.extract_slice %3209[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3235 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3236 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2450 : tensor<1x32x80x64xf32>) outs(%3235 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3237 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2451 = tensor.insert_slice %3236 into %3237[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2452 = tensor.insert_slice %extracted_slice_2449 into %inserted_slice_2451[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3238 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3239 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2452, %3223 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3238 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3240 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3241 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3234, %3239 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3240 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3242 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3243 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3241 : tensor<1x32x80x128xf32>) outs(%3242 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2453 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2454 = tensor.collapse_shape %3232 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2455 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2456 = tensor.collapse_shape %3243 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2457 = arith.constant 0.000000e+00 : f32
+    %3244 = tensor.empty() : tensor<32x80x80xf32>
+    %3245 = linalg.fill ins(%cst_2457 : f32) outs(%3244 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3246 = linalg.batch_matmul ins(%collapsed_2454, %collapsed_2456 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3245 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2458 = tensor.expand_shape %3246 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2459 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3247 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3248 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2459 : tensor<1x32x80x80xf32>) outs(%3247 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3249 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3250 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2458, %3248 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3249 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3251 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2460 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3252 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3250, %collapsed_2460 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3251 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3253 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3254 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3253 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3255 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3252 : tensor<1x32x80x80xf32>) outs(%3254 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32 // running max; %out is seeded by the 0xFF800000 (-inf) fill in %3254, not the uninitialized tensor.empty
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3256 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3257 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3252, %3255 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3256 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3258 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3259 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3258 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3260 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3257 : tensor<1x32x80x80xf32>) outs(%3259 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3261 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3262 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3257, %3260 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3261 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2461 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2462 = tensor.collapse_shape %3262 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2463 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2464 = tensor.collapse_shape %3211 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2465 = arith.constant 0.000000e+00 : f32
+    %3263 = tensor.empty() : tensor<32x80x128xf32>
+    %3264 = linalg.fill ins(%cst_2465 : f32) outs(%3263 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3265 = linalg.batch_matmul ins(%collapsed_2462, %collapsed_2464 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3264 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2466 = tensor.expand_shape %3265 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3266 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3267 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2466 : tensor<1x32x80x128xf32>) outs(%3266 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2467 = tensor.collapse_shape %3267 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3268 = tensor.empty() : tensor<4096x4096xf32>
+    %3269 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_464 : tensor<4096x4096xf32>) outs(%3268 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2468 = tensor.collapse_shape %collapsed_2467 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2469 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3270 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2468, %3269 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2469 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2470 = tensor.expand_shape %3270 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3271 = tensor.empty() : tensor<1x80x4096xf32>
+    %3272 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185, %expanded_2470 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3271 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3273 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2471 = arith.constant 2.000000e+00 : f32
+    %3274 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272 : tensor<1x80x4096xf32>) outs(%3273 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2471 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2472 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3275 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3274 : tensor<1x80x4096xf32>) outs(%cst_2472 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2473 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3276 = tensor.empty() : tensor<1x80x1xf32>
+    %3277 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3275, %cst_2473 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3276 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3278 = tensor.empty() : tensor<1x80x1xf32>
+    %3279 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3277 : tensor<1x80x1xf32>) outs(%3278 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3280 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2474 = tensor.collapse_shape %3279 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3281 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272, %collapsed_2474 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3280 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2475 = tensor.expand_shape %extracted_slice_56 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3282 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2476 = tensor.collapse_shape %expanded_2475 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3283 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2476, %3281 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3282 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3284 = tensor.empty() : tensor<4096x11008xf32>
+    %3285 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_466 : tensor<11008x4096xf32>) outs(%3284 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2477 = tensor.collapse_shape %3283 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2478 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3286 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2477, %3285 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2478 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2479 = tensor.expand_shape %3286 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3287 = tensor.empty() : tensor<1x80x11008xf32>
+    %3288 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2479 : tensor<1x80x11008xf32>) outs(%3287 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3289 = tensor.empty() : tensor<4096x11008xf32>
+    %3290 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_468 : tensor<11008x4096xf32>) outs(%3289 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2480 = tensor.collapse_shape %3283 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2481 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3291 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2480, %3290 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2481 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2482 = tensor.expand_shape %3291 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3292 = tensor.empty() : tensor<1x80x11008xf32>
+    %3293 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3288, %expanded_2482 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3292 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3294 = tensor.empty() : tensor<11008x4096xf32>
+    %3295 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_470 : tensor<4096x11008xf32>) outs(%3294 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2483 = tensor.collapse_shape %3293 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2484 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3296 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2483, %3295 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2484 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2485 = tensor.expand_shape %3296 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3297 = tensor.empty() : tensor<1x80x4096xf32>
+    %3298 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272, %expanded_2485 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3297 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3299 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2486 = arith.constant 2.000000e+00 : f32
+    %3300 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298 : tensor<1x80x4096xf32>) outs(%3299 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2486 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2487 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3301 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3300 : tensor<1x80x4096xf32>) outs(%cst_2487 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2488 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3302 = tensor.empty() : tensor<1x80x1xf32>
+    %3303 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3301, %cst_2488 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3302 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3304 = tensor.empty() : tensor<1x80x1xf32>
+    %3305 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3303 : tensor<1x80x1xf32>) outs(%3304 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3306 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2489 = tensor.collapse_shape %3305 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3307 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298, %collapsed_2489 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3306 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2490 = tensor.expand_shape %extracted_slice_57 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3308 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2491 = tensor.collapse_shape %expanded_2490 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3309 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2491, %3307 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3308 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3310 = tensor.empty() : tensor<4096x4096xf32>
+    %3311 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_472 : tensor<4096x4096xf32>) outs(%3310 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2492 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2493 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3312 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2492, %3311 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2493 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2494 = tensor.expand_shape %3312 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3313 = tensor.empty() : tensor<4096x4096xf32>
+    %3314 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_474 : tensor<4096x4096xf32>) outs(%3313 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2495 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2496 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3315 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2495, %3314 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2496 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2497 = tensor.expand_shape %3315 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3316 = tensor.empty() : tensor<4096x4096xf32>
+    %3317 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_476 : tensor<4096x4096xf32>) outs(%3316 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2498 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2499 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3318 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2498, %3317 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2499 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2500 = tensor.expand_shape %3318 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2501 = tensor.expand_shape %expanded_2494 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3319 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3320 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2501 : tensor<1x80x32x128xf32>) outs(%3319 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2502 = tensor.expand_shape %expanded_2497 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3321 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3322 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2502 : tensor<1x80x32x128xf32>) outs(%3321 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2503 = tensor.expand_shape %expanded_2500 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3323 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3324 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2503 : tensor<1x80x32x128xf32>) outs(%3323 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2504 = tensor.extract_slice %expanded_632[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2505 = tensor.extract_slice %expanded_634[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %3325 = tensor.empty() : tensor<1x80x128xf32>
+    %3326 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2504 : tensor<1x1x80x128xf32>) outs(%3325 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3327 = tensor.empty() : tensor<80x128xf32>
+    %3328 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3326 : tensor<1x80x128xf32>) outs(%3327 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3329 = tensor.empty() : tensor<1x80x128xf32>
+    %3330 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2505 : tensor<1x1x80x128xf32>) outs(%3329 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3331 = tensor.empty() : tensor<80x128xf32>
+    %3332 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3330 : tensor<1x80x128xf32>) outs(%3331 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3333 = tensor.empty() : tensor<1x80x128xf32>
+    %3334 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3333 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3328[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2506 = tensor.expand_shape %3334 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3335 = tensor.empty() : tensor<1x80x128xf32>
+    %3336 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3335 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3332[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2507 = tensor.expand_shape %3336 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3337 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3338 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3320, %3334 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3337 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2508 = tensor.extract_slice %3320[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2509 = tensor.extract_slice %3320[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3339 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3340 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2509 : tensor<1x32x80x64xf32>) outs(%3339 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3341 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2510 = tensor.insert_slice %3340 into %3341[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2511 = tensor.insert_slice %extracted_slice_2508 into %inserted_slice_2510[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3342 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3343 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2511, %3336 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3342 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3344 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3345 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3338, %3343 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3344 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3346 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3347 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3322, %3334 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3346 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2512 = tensor.extract_slice %3322[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2513 = tensor.extract_slice %3322[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3348 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3349 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2513 : tensor<1x32x80x64xf32>) outs(%3348 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3350 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2514 = tensor.insert_slice %3349 into %3350[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2515 = tensor.insert_slice %extracted_slice_2512 into %inserted_slice_2514[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3351 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3352 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2515, %3336 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3351 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3353 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3354 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3347, %3352 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3353 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3355 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3356 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3354 : tensor<1x32x80x128xf32>) outs(%3355 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2516 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2517 = tensor.collapse_shape %3345 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2518 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2519 = tensor.collapse_shape %3356 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2520 = arith.constant 0.000000e+00 : f32
+    %3357 = tensor.empty() : tensor<32x80x80xf32>
+    %3358 = linalg.fill ins(%cst_2520 : f32) outs(%3357 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3359 = linalg.batch_matmul ins(%collapsed_2517, %collapsed_2519 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3358 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2521 = tensor.expand_shape %3359 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2522 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3360 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3361 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2522 : tensor<1x32x80x80xf32>) outs(%3360 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3362 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3363 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2521, %3361 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3362 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3364 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2523 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3365 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3363, %collapsed_2523 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3364 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3366 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3367 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3366 : tensor<1x32x80x1xf32>) {  // Seed the max accumulator: fill every element with -inf.
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32  // 0xFF800000 is the f32 bit pattern of -infinity, the identity for max.
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3368 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3365 : tensor<1x32x80x80xf32>) outs(%3367 : tensor<1x32x80x1xf32>) {  // Max-reduce %3365 along the "reduction" iterator; init must be the -inf-filled %3367, not the uninitialized %3366.
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32  // %out carries the running max, so its initial value has to be defined.
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3369 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3370 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3365, %3368 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3369 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3371 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3372 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3371 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3373 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3370 : tensor<1x32x80x80xf32>) outs(%3372 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3374 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3375 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3370, %3373 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3374 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2524 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2525 = tensor.collapse_shape %3375 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2526 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2527 = tensor.collapse_shape %3324 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2528 = arith.constant 0.000000e+00 : f32
+    %3376 = tensor.empty() : tensor<32x80x128xf32>
+    %3377 = linalg.fill ins(%cst_2528 : f32) outs(%3376 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3378 = linalg.batch_matmul ins(%collapsed_2525, %collapsed_2527 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3377 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2529 = tensor.expand_shape %3378 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3379 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3380 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2529 : tensor<1x32x80x128xf32>) outs(%3379 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2530 = tensor.collapse_shape %3380 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3381 = tensor.empty() : tensor<4096x4096xf32>
+    %3382 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_478 : tensor<4096x4096xf32>) outs(%3381 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2531 = tensor.collapse_shape %collapsed_2530 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2532 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3383 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2531, %3382 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2532 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2533 = tensor.expand_shape %3383 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3384 = tensor.empty() : tensor<1x80x4096xf32>
+    %3385 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298, %expanded_2533 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3384 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3386 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2534 = arith.constant 2.000000e+00 : f32
+    %3387 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385 : tensor<1x80x4096xf32>) outs(%3386 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2534 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2535 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3388 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3387 : tensor<1x80x4096xf32>) outs(%cst_2535 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2536 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3389 = tensor.empty() : tensor<1x80x1xf32>
+    %3390 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3388, %cst_2536 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3389 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3391 = tensor.empty() : tensor<1x80x1xf32>
+    %3392 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3390 : tensor<1x80x1xf32>) outs(%3391 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3393 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2537 = tensor.collapse_shape %3392 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3394 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385, %collapsed_2537 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3393 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2538 = tensor.expand_shape %extracted_slice_58 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3395 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2539 = tensor.collapse_shape %expanded_2538 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3396 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2539, %3394 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3395 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3397 = tensor.empty() : tensor<4096x11008xf32>
+    %3398 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_480 : tensor<11008x4096xf32>) outs(%3397 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2540 = tensor.collapse_shape %3396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2541 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3399 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2540, %3398 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2541 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2542 = tensor.expand_shape %3399 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3400 = tensor.empty() : tensor<1x80x11008xf32>
+    %3401 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2542 : tensor<1x80x11008xf32>) outs(%3400 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3402 = tensor.empty() : tensor<4096x11008xf32>
+    %3403 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_482 : tensor<11008x4096xf32>) outs(%3402 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2543 = tensor.collapse_shape %3396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2544 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3404 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2543, %3403 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2544 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2545 = tensor.expand_shape %3404 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3405 = tensor.empty() : tensor<1x80x11008xf32>
+    %3406 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3401, %expanded_2545 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3405 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3407 = tensor.empty() : tensor<11008x4096xf32>
+    %3408 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_484 : tensor<4096x11008xf32>) outs(%3407 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2546 = tensor.collapse_shape %3406 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2547 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3409 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2546, %3408 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2547 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2548 = tensor.expand_shape %3409 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3410 = tensor.empty() : tensor<1x80x4096xf32>
+    %3411 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385, %expanded_2548 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3410 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3412 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2549 = arith.constant 2.000000e+00 : f32
+    %3413 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411 : tensor<1x80x4096xf32>) outs(%3412 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2549 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2550 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3414 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3413 : tensor<1x80x4096xf32>) outs(%cst_2550 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2551 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3415 = tensor.empty() : tensor<1x80x1xf32>
+    %3416 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3414, %cst_2551 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3415 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3417 = tensor.empty() : tensor<1x80x1xf32>
+    %3418 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3416 : tensor<1x80x1xf32>) outs(%3417 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3419 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2552 = tensor.collapse_shape %3418 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3420 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411, %collapsed_2552 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3419 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2553 = tensor.expand_shape %extracted_slice_59 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3421 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2554 = tensor.collapse_shape %expanded_2553 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3422 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2554, %3420 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3421 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3423 = tensor.empty() : tensor<4096x4096xf32>
+    %3424 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_486 : tensor<4096x4096xf32>) outs(%3423 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2555 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2556 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3425 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2555, %3424 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2556 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2557 = tensor.expand_shape %3425 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3426 = tensor.empty() : tensor<4096x4096xf32>
+    %3427 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_488 : tensor<4096x4096xf32>) outs(%3426 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2558 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2559 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3428 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2558, %3427 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2559 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2560 = tensor.expand_shape %3428 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3429 = tensor.empty() : tensor<4096x4096xf32>
+    %3430 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_490 : tensor<4096x4096xf32>) outs(%3429 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2561 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2562 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3431 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2561, %3430 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2562 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2563 = tensor.expand_shape %3431 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2564 = tensor.expand_shape %expanded_2557 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3432 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3433 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2564 : tensor<1x80x32x128xf32>) outs(%3432 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2565 = tensor.expand_shape %expanded_2560 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3434 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3435 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2565 : tensor<1x80x32x128xf32>) outs(%3434 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2566 = tensor.expand_shape %expanded_2563 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3436 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3437 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2566 : tensor<1x80x32x128xf32>) outs(%3436 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2567 = tensor.extract_slice %expanded_636[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2568 = tensor.extract_slice %expanded_638[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %3438 = tensor.empty() : tensor<1x80x128xf32>
+    %3439 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2567 : tensor<1x1x80x128xf32>) outs(%3438 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3440 = tensor.empty() : tensor<80x128xf32>
+    %3441 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3439 : tensor<1x80x128xf32>) outs(%3440 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3442 = tensor.empty() : tensor<1x80x128xf32>
+    %3443 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2568 : tensor<1x1x80x128xf32>) outs(%3442 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3444 = tensor.empty() : tensor<80x128xf32>
+    %3445 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3443 : tensor<1x80x128xf32>) outs(%3444 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3446 = tensor.empty() : tensor<1x80x128xf32>
+    %3447 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3446 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3441[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2569 = tensor.expand_shape %3447 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3448 = tensor.empty() : tensor<1x80x128xf32>
+    %3449 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3448 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3445[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2570 = tensor.expand_shape %3449 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3450 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3451 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3433, %3447 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3450 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2571 = tensor.extract_slice %3433[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2572 = tensor.extract_slice %3433[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3452 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3453 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2572 : tensor<1x32x80x64xf32>) outs(%3452 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3454 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2573 = tensor.insert_slice %3453 into %3454[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2574 = tensor.insert_slice %extracted_slice_2571 into %inserted_slice_2573[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3455 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3456 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2574, %3449 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3455 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3457 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3458 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3451, %3456 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3457 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3459 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3460 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3435, %3447 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3459 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2575 = tensor.extract_slice %3435[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2576 = tensor.extract_slice %3435[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3461 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3462 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2576 : tensor<1x32x80x64xf32>) outs(%3461 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3463 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2577 = tensor.insert_slice %3462 into %3463[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2578 = tensor.insert_slice %extracted_slice_2575 into %inserted_slice_2577[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3464 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3465 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2578, %3449 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3464 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3466 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3467 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3460, %3465 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3466 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3468 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3469 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3467 : tensor<1x32x80x128xf32>) outs(%3468 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2579 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2580 = tensor.collapse_shape %3458 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2581 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2582 = tensor.collapse_shape %3469 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2583 = arith.constant 0.000000e+00 : f32
+    %3470 = tensor.empty() : tensor<32x80x80xf32>
+    %3471 = linalg.fill ins(%cst_2583 : f32) outs(%3470 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3472 = linalg.batch_matmul ins(%collapsed_2580, %collapsed_2582 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3471 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2584 = tensor.expand_shape %3472 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2585 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3473 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3474 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2585 : tensor<1x32x80x80xf32>) outs(%3473 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3475 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3476 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2584, %3474 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3475 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3477 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2586 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3478 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3476, %collapsed_2586 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3477 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3479 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3480 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3479 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3481 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3478 : tensor<1x32x80x80xf32>) outs(%3479 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3482 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3483 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3478, %3481 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3482 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3484 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3485 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3484 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3486 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3483 : tensor<1x32x80x80xf32>) outs(%3485 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3487 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3488 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3483, %3486 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3487 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2587 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2588 = tensor.collapse_shape %3488 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2589 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2590 = tensor.collapse_shape %3437 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2591 = arith.constant 0.000000e+00 : f32
+    %3489 = tensor.empty() : tensor<32x80x128xf32>
+    %3490 = linalg.fill ins(%cst_2591 : f32) outs(%3489 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3491 = linalg.batch_matmul ins(%collapsed_2588, %collapsed_2590 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3490 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2592 = tensor.expand_shape %3491 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3492 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3493 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2592 : tensor<1x32x80x128xf32>) outs(%3492 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2593 = tensor.collapse_shape %3493 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3494 = tensor.empty() : tensor<4096x4096xf32>
+    %3495 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_492 : tensor<4096x4096xf32>) outs(%3494 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2594 = tensor.collapse_shape %collapsed_2593 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2595 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3496 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2594, %3495 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2595 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2596 = tensor.expand_shape %3496 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3497 = tensor.empty() : tensor<1x80x4096xf32>
+    %3498 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411, %expanded_2596 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3497 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3499 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2597 = arith.constant 2.000000e+00 : f32
+    %3500 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498 : tensor<1x80x4096xf32>) outs(%3499 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2597 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2598 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3501 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3500 : tensor<1x80x4096xf32>) outs(%cst_2598 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2599 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3502 = tensor.empty() : tensor<1x80x1xf32>
+    %3503 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3501, %cst_2599 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3502 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3504 = tensor.empty() : tensor<1x80x1xf32>
+    %3505 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3503 : tensor<1x80x1xf32>) outs(%3504 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3506 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2600 = tensor.collapse_shape %3505 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3507 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498, %collapsed_2600 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3506 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2601 = tensor.expand_shape %extracted_slice_60 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3508 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2602 = tensor.collapse_shape %expanded_2601 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3509 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2602, %3507 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3508 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3510 = tensor.empty() : tensor<4096x11008xf32>
+    %3511 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_494 : tensor<11008x4096xf32>) outs(%3510 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2603 = tensor.collapse_shape %3509 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2604 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3512 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2603, %3511 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2604 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2605 = tensor.expand_shape %3512 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3513 = tensor.empty() : tensor<1x80x11008xf32>
+    %3514 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2605 : tensor<1x80x11008xf32>) outs(%3513 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3515 = tensor.empty() : tensor<4096x11008xf32>
+    %3516 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_496 : tensor<11008x4096xf32>) outs(%3515 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2606 = tensor.collapse_shape %3509 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2607 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3517 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2606, %3516 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2607 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2608 = tensor.expand_shape %3517 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3518 = tensor.empty() : tensor<1x80x11008xf32>
+    %3519 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3514, %expanded_2608 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3518 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3520 = tensor.empty() : tensor<11008x4096xf32>
+    %3521 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_498 : tensor<4096x11008xf32>) outs(%3520 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2609 = tensor.collapse_shape %3519 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2610 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3522 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2609, %3521 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2610 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2611 = tensor.expand_shape %3522 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3523 = tensor.empty() : tensor<1x80x4096xf32>
+    %3524 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498, %expanded_2611 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3523 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3525 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2612 = arith.constant 2.000000e+00 : f32
+    %3526 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524 : tensor<1x80x4096xf32>) outs(%3525 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2612 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2613 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3527 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3526 : tensor<1x80x4096xf32>) outs(%cst_2613 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2614 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3528 = tensor.empty() : tensor<1x80x1xf32>
+    %3529 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3527, %cst_2614 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3528 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3530 = tensor.empty() : tensor<1x80x1xf32>
+    %3531 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3529 : tensor<1x80x1xf32>) outs(%3530 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3532 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2615 = tensor.collapse_shape %3531 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3533 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524, %collapsed_2615 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3532 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2616 = tensor.expand_shape %extracted_slice_61 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3534 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2617 = tensor.collapse_shape %expanded_2616 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3535 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2617, %3533 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3534 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3536 = tensor.empty() : tensor<4096x4096xf32>
+    %3537 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_500 : tensor<4096x4096xf32>) outs(%3536 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2618 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2619 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3538 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2618, %3537 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2619 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2620 = tensor.expand_shape %3538 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3539 = tensor.empty() : tensor<4096x4096xf32>
+    %3540 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_502 : tensor<4096x4096xf32>) outs(%3539 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2621 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2622 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3541 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2621, %3540 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2622 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2623 = tensor.expand_shape %3541 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3542 = tensor.empty() : tensor<4096x4096xf32>
+    %3543 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_504 : tensor<4096x4096xf32>) outs(%3542 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2624 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2625 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3544 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2624, %3543 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2625 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2626 = tensor.expand_shape %3544 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %expanded_2627 = tensor.expand_shape %expanded_2620 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3545 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3546 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2627 : tensor<1x80x32x128xf32>) outs(%3545 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2628 = tensor.expand_shape %expanded_2623 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3547 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3548 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2628 : tensor<1x80x32x128xf32>) outs(%3547 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %expanded_2629 = tensor.expand_shape %expanded_2626 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32>
+    %3549 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3550 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2629 : tensor<1x80x32x128xf32>) outs(%3549 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2630 = tensor.extract_slice %expanded_640[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %extracted_slice_2631 = tensor.extract_slice %expanded_642[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32>
+    %3551 = tensor.empty() : tensor<1x80x128xf32>
+    %3552 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2630 : tensor<1x1x80x128xf32>) outs(%3551 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3553 = tensor.empty() : tensor<80x128xf32>
+    %3554 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3552 : tensor<1x80x128xf32>) outs(%3553 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3555 = tensor.empty() : tensor<1x80x128xf32>
+    %3556 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2631 : tensor<1x1x80x128xf32>) outs(%3555 : tensor<1x80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x128xf32>
+    %3557 = tensor.empty() : tensor<80x128xf32>
+    %3558 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3556 : tensor<1x80x128xf32>) outs(%3557 : tensor<80x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<80x128xf32>
+    %3559 = tensor.empty() : tensor<1x80x128xf32>
+    %3560 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3559 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3554[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2632 = tensor.expand_shape %3560 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3561 = tensor.empty() : tensor<1x80x128xf32>
+    %3562 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3561 : tensor<1x80x128xf32>) {
+    ^bb0(%in: i64, %out: f32):
+      %3652 = arith.index_cast %in : i64 to index
+      %3653 = linalg.index 2 : index
+      %extracted = tensor.extract %3558[%3652, %3653] : tensor<80x128xf32>
+      linalg.yield %extracted : f32
+    } -> tensor<1x80x128xf32>
+    %expanded_2633 = tensor.expand_shape %3562 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32>
+    %3563 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3564 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3546, %3560 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3563 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2634 = tensor.extract_slice %3546[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2635 = tensor.extract_slice %3546[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3565 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3566 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2635 : tensor<1x32x80x64xf32>) outs(%3565 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3567 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2636 = tensor.insert_slice %3566 into %3567[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2637 = tensor.insert_slice %extracted_slice_2634 into %inserted_slice_2636[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3568 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3569 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2637, %3562 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3568 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3570 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3571 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3564, %3569 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3570 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3572 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3573 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3548, %3560 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3572 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %extracted_slice_2638 = tensor.extract_slice %3548[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %extracted_slice_2639 = tensor.extract_slice %3548[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32>
+    %3574 = tensor.empty() : tensor<1x32x80x64xf32>
+    %3575 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2639 : tensor<1x32x80x64xf32>) outs(%3574 : tensor<1x32x80x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x64xf32>
+    %3576 = tensor.empty() : tensor<1x32x80x128xf32>
+    %inserted_slice_2640 = tensor.insert_slice %3575 into %3576[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %inserted_slice_2641 = tensor.insert_slice %extracted_slice_2638 into %inserted_slice_2640[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32>
+    %3577 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3578 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2641, %3562 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3577 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3579 = tensor.empty() : tensor<1x32x80x128xf32>
+    %3580 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3573, %3578 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3579 : tensor<1x32x80x128xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x128xf32>
+    %3581 = tensor.empty() : tensor<1x32x128x80xf32>
+    %3582 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3580 : tensor<1x32x80x128xf32>) outs(%3581 : tensor<1x32x128x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x32x128x80xf32>
+    %cst_2642 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2643 = tensor.collapse_shape %3571 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2644 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32>
+    %collapsed_2645 = tensor.collapse_shape %3582 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32>
+    %cst_2646 = arith.constant 0.000000e+00 : f32
+    %3583 = tensor.empty() : tensor<32x80x80xf32>
+    %3584 = linalg.fill ins(%cst_2646 : f32) outs(%3583 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %3585 = linalg.batch_matmul ins(%collapsed_2643, %collapsed_2645 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3584 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32>
+    %expanded_2647 = tensor.expand_shape %3585 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32>
+    %cst_2648 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32>
+    %3586 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3587 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2648 : tensor<1x32x80x80xf32>) outs(%3586 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3652 = arith.divf %cst_2684, %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3588 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3589 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2647, %3587 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3588 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3590 = tensor.empty() : tensor<1x32x80x80xf32>
+    %collapsed_2649 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32>
+    %3591 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3589, %collapsed_2649 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3590 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3592 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3593 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3592 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0xFF800000 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3594 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3591 : tensor<1x32x80x80xf32>) outs(%3592 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.maximumf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3595 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3596 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3591, %3594 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3595 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.subf %in, %in_2684 : f32
+      %3653 = math.exp %3652 : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x32x80x80xf32>
+    %3597 = tensor.empty() : tensor<1x32x80x1xf32>
+    %3598 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3597 : tensor<1x32x80x1xf32>) {
+    ^bb0(%out: f32):
+      %cst_2684 = arith.constant 0.000000e+00 : f32
+      linalg.yield %cst_2684 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3599 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3596 : tensor<1x32x80x80xf32>) outs(%3598 : tensor<1x32x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.addf %in, %out : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x1xf32>
+    %3600 = tensor.empty() : tensor<1x32x80x80xf32>
+    %3601 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3596, %3599 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3600 : tensor<1x32x80x80xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.divf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x32x80x80xf32>
+    %cst_2650 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32>
+    %collapsed_2651 = tensor.collapse_shape %3601 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32>
+    %cst_2652 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32>
+    %collapsed_2653 = tensor.collapse_shape %3550 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32>
+    %cst_2654 = arith.constant 0.000000e+00 : f32
+    %3602 = tensor.empty() : tensor<32x80x128xf32>
+    %3603 = linalg.fill ins(%cst_2654 : f32) outs(%3602 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %3604 = linalg.batch_matmul ins(%collapsed_2651, %collapsed_2653 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3603 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32>
+    %expanded_2655 = tensor.expand_shape %3604 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32>
+    %3605 = tensor.empty() : tensor<1x80x32x128xf32>
+    %3606 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2655 : tensor<1x32x80x128xf32>) outs(%3605 : tensor<1x80x32x128xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<1x80x32x128xf32>
+    %collapsed_2656 = tensor.collapse_shape %3606 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32>
+    %3607 = tensor.empty() : tensor<4096x4096xf32>
+    %3608 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_506 : tensor<4096x4096xf32>) outs(%3607 : tensor<4096x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x4096xf32>
+    %collapsed_2657 = tensor.collapse_shape %collapsed_2656 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2658 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3609 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2657, %3608 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2658 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2659 = tensor.expand_shape %3609 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3610 = tensor.empty() : tensor<1x80x4096xf32>
+    %3611 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524, %expanded_2659 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3610 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3612 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2660 = arith.constant 2.000000e+00 : f32
+    %3613 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611 : tensor<1x80x4096xf32>) outs(%3612 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2660 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2661 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3614 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3613 : tensor<1x80x4096xf32>) outs(%cst_2661 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2662 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3615 = tensor.empty() : tensor<1x80x1xf32>
+    %3616 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3614, %cst_2662 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3615 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3617 = tensor.empty() : tensor<1x80x1xf32>
+    %3618 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3616 : tensor<1x80x1xf32>) outs(%3617 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3619 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2663 = tensor.collapse_shape %3618 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3620 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611, %collapsed_2663 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3619 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2664 = tensor.expand_shape %extracted_slice_62 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3621 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2665 = tensor.collapse_shape %expanded_2664 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3622 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2665, %3620 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3621 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3623 = tensor.empty() : tensor<4096x11008xf32>
+    %3624 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_508 : tensor<11008x4096xf32>) outs(%3623 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2666 = tensor.collapse_shape %3622 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2667 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3625 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2666, %3624 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2667 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2668 = tensor.expand_shape %3625 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3626 = tensor.empty() : tensor<1x80x11008xf32>
+    %3627 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2668 : tensor<1x80x11008xf32>) outs(%3626 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = arith.negf %in : f32
+      %3653 = math.exp %3652 : f32
+      %cst_2684 = arith.constant 1.000000e+00 : f32
+      %3654 = arith.addf %cst_2684, %3653 : f32
+      %3655 = arith.divf %in, %3654 : f32
+      linalg.yield %3655 : f32
+    } -> tensor<1x80x11008xf32>
+    %3628 = tensor.empty() : tensor<4096x11008xf32>
+    %3629 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_510 : tensor<11008x4096xf32>) outs(%3628 : tensor<4096x11008xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x11008xf32>
+    %collapsed_2669 = tensor.collapse_shape %3622 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2670 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32>
+    %3630 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2669, %3629 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2670 : tensor<80x11008xf32>) -> tensor<80x11008xf32>
+    %expanded_2671 = tensor.expand_shape %3630 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32>
+    %3631 = tensor.empty() : tensor<1x80x11008xf32>
+    %3632 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3627, %expanded_2671 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3631 : tensor<1x80x11008xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x11008xf32>
+    %3633 = tensor.empty() : tensor<11008x4096xf32>
+    %3634 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_512 : tensor<4096x11008xf32>) outs(%3633 : tensor<11008x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<11008x4096xf32>
+    %collapsed_2672 = tensor.collapse_shape %3632 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32>
+    %cst_2673 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32>
+    %3635 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2672, %3634 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2673 : tensor<80x4096xf32>) -> tensor<80x4096xf32>
+    %expanded_2674 = tensor.expand_shape %3635 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32>
+    %3636 = tensor.empty() : tensor<1x80x4096xf32>
+    %3637 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611, %expanded_2674 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3636 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3638 = tensor.empty() : tensor<1x80x4096xf32>
+    %cst_2675 = arith.constant 2.000000e+00 : f32
+    %3639 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3637 : tensor<1x80x4096xf32>) outs(%3638 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.powf %in, %cst_2675 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %cst_2676 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32>
+    %3640 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3639 : tensor<1x80x4096xf32>) outs(%cst_2676 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %cst_2684 = arith.constant 4.096000e+03 : f32
+      %3652 = arith.divf %in, %cst_2684 : f32
+      %3653 = arith.addf %3652, %out : f32
+      linalg.yield %3653 : f32
+    } -> tensor<1x80x1xf32>
+    %cst_2677 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32>
+    %3641 = tensor.empty() : tensor<1x80x1xf32>
+    %3642 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3640, %cst_2677 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3641 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.addf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3643 = tensor.empty() : tensor<1x80x1xf32>
+    %3644 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3642 : tensor<1x80x1xf32>) outs(%3643 : tensor<1x80x1xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %3652 = math.rsqrt %in : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x1xf32>
+    %3645 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2678 = tensor.collapse_shape %3644 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32>
+    %3646 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3637, %collapsed_2678 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3645 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %expanded_2679 = tensor.expand_shape %extracted_slice_63 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32>
+    %3647 = tensor.empty() : tensor<1x80x4096xf32>
+    %collapsed_2680 = tensor.collapse_shape %expanded_2679 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32>
+    %3648 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2680, %3646 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3647 : tensor<1x80x4096xf32>) {
+    ^bb0(%in: f32, %in_2684: f32, %out: f32):
+      %3652 = arith.mulf %in, %in_2684 : f32
+      linalg.yield %3652 : f32
+    } -> tensor<1x80x4096xf32>
+    %3649 = tensor.empty() : tensor<4096x32000xf32>
+    %3650 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_514 : tensor<32000x4096xf32>) outs(%3649 : tensor<4096x32000xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<4096x32000xf32>
+    %collapsed_2681 = tensor.collapse_shape %3648 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32>
+    %cst_2682 = arith.constant dense<0.000000e+00> : tensor<80x32000xf32>
+    %3651 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_2681, %3650 : tensor<80x4096xf32>, tensor<4096x32000xf32>) outs(%cst_2682 : tensor<80x32000xf32>) -> tensor<80x32000xf32>
+    %expanded_2683 = tensor.expand_shape %3651 [[0, 1], [2]] : tensor<80x32000xf32> into tensor<1x80x32000xf32>
+    return %expanded_2683 : tensor<1x80x32000xf32>
+  }
+}
+
diff --git a/examples/BuddyLlama/llama-main.cpp b/examples/BuddyLlama/llama-main.cpp
index 78b5cec02..55530a01c 100644
--- a/examples/BuddyLlama/llama-main.cpp
+++ b/examples/BuddyLlama/llama-main.cpp
@@ -18,12 +18,9 @@
 #include <buddy/LLM/TextContainer.h>
 #include <chrono>
 #include <cstddef>
-#include <cstdint>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
-#include <limits>
-#include <type_traits>
 
 using namespace buddy;
 
diff --git a/examples/BuddyLlama/llama.sh b/examples/BuddyLlama/llama.sh
new file mode 100755
index 000000000..65eab547f
--- /dev/null
+++ b/examples/BuddyLlama/llama.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Lowers llama-linalg-default.mlir through bufferization and the GPU/NVVM passes,
+# compiles it to llama.o, links it with llama-main.cpp.o and runs llama.out.
+# llama-main.cpp.o is not produced here; it must already be in the working directory.
+#
+# LLVM_BUILD_DIR: LLVM build tree providing bin/llc and the MLIR runtime
+# libraries. Defaults to the path this script previously hard-coded.
+
+# Abort on the first failing stage so later stages never run on stale outputs.
+set -e
+
+LLVM_BUILD_DIR="${LLVM_BUILD_DIR:-/home/liam/IPRC/llvm-project/build}"
+
+buddy-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o llama-bufferized.mlir
+# Alternative bufferization with upstream mlir-opt, kept for reference:
+# mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries copy-before-write" -expand-realloc  -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize  -o llama-bufferized.mlir
+buddy-opt llama-bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o llama-outlined.mlir
+buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir
+buddy-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
+mlir-opt llama-nvvm.mlir -llvm-request-c-wrappers -o llama-wrapper.mlir
+mlir-opt llama-wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o llama-cubin.mlir
+mlir-translate llama-cubin.mlir --mlir-to-llvmir -o llama.ll
+"${LLVM_BUILD_DIR}/bin/llc" llama.ll -filetype=obj -relocation-model=pic -O3 -o llama.o
+clang llama.o llama-main.cpp.o "${LLVM_BUILD_DIR}/lib/libmlir_cuda_runtime.so" "${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.so" -lstdc++ -o llama.out
+./llama.out
diff --git a/examples/BuddyLlama/test.mlir b/examples/BuddyLlama/test.mlir
new file mode 100644
index 000000000..12aba71c3
--- /dev/null
+++ b/examples/BuddyLlama/test.mlir
@@ -0,0 +1,23 @@
+// Smoke test: multiplies two constant-filled 8x8 i8 buffers with linalg.matmul.
+func.func @main() {
+  %0 = arith.constant 0 : i8
+  %1 = arith.constant 1 : i8
+  %2 = arith.constant 2 : i8
+  %mem0 = memref.alloc() : memref<8x8xi8>
+  %mem1 = memref.alloc() : memref<8x8xi8>
+  %mem2 = memref.alloc() : memref<8x8xi8>
+  // Inputs: %mem0 holds 2 in every element, %mem1 holds 1 in every element.
+  linalg.fill ins(%2 : i8) outs(%mem0 : memref<8x8xi8>)
+  linalg.fill ins(%1 : i8) outs(%mem1 : memref<8x8xi8>)
+  // linalg.matmul accumulates into its `outs` buffer, and memref.alloc leaves
+  // memory uninitialized, so zero the result buffer before multiplying.
+  linalg.fill ins(%0 : i8) outs(%mem2 : memref<8x8xi8>)
+  linalg.matmul
+    ins(%mem0, %mem1 : memref<8x8xi8>, memref<8x8xi8>)
+  outs(%mem2 : memref<8x8xi8>)
+  // Each element of %mem2 is now 2 * 1 summed over 8 terms, i.e. 16.
+  memref.dealloc %mem0 : memref<8x8xi8>
+  memref.dealloc %mem1 : memref<8x8xi8>
+  memref.dealloc %mem2 : memref<8x8xi8>
+  return
+}
\ No newline at end of file
diff --git a/examples/BuddyLlama/test.sh b/examples/BuddyLlama/test.sh
new file mode 100755
index 000000000..b31a5b3b9
--- /dev/null
+++ b/examples/BuddyLlama/test.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Lowers test.mlir through bufferization and the GPU/NVVM passes, then executes
+# the result with mlir-cpu-runner.
+#
+# LLVM_BUILD_DIR: LLVM build tree providing the MLIR runner and CUDA runtime
+# libraries. Defaults to the path this script previously hard-coded.
+
+# Abort on the first failing stage so later stages never run on stale outputs.
+set -e
+
+LLVM_BUILD_DIR="${LLVM_BUILD_DIR:-/home/liam/IPRC/llvm-project/build}"
+
+mlir-opt test.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o bufferized.mlir
+mlir-opt bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o outlined.mlir
+buddy-opt outlined.mlir -gpu-host-register -o host-registered.mlir
+mlir-opt host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o nvvm.mlir
+mlir-opt nvvm.mlir -llvm-request-c-wrappers -o wrapper.mlir
+mlir-opt wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o cubin.mlir
+mlir-cpu-runner cubin.mlir -entry-point-result=void -shared-libs="${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.so" -shared-libs="${LLVM_BUILD_DIR}/lib/libmlir_cuda_runtime.so"
diff --git a/examples/BuddyPython/bert.py b/examples/BuddyPython/bert.py
index 7f4f00435..e57dc991b 100644
--- a/examples/BuddyPython/bert.py
+++ b/examples/BuddyPython/bert.py
@@ -15,6 +15,10 @@
 text = "Replace me by any text you'd like."
 encoded_text = tokenizer(text, return_tensors="pt")
 with torch.no_grad():
-    module, params = dynamo_compiler.importer(model, **encoded_text)
-    print(module)
-    print(params)
+    graphs = dynamo_compiler.importer(model, **encoded_text)
+
+# Take the first imported graph, fetch its parameters, then lower it to top-level IR.
+bert_graph = graphs[0]
+bert_params = dynamo_compiler.imported_params[bert_graph]
+bert_graph.lower_to_top_level_ir(do_params_pack=True)
+print(bert_graph._imported_module, bert_params, sep="\n")
diff --git a/examples/BuddyPython/module_gen.py b/examples/BuddyPython/module_gen.py
index 10a1e2ee1..e2c722ceb 100644
--- a/examples/BuddyPython/module_gen.py
+++ b/examples/BuddyPython/module_gen.py
@@ -43,23 +43,12 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-# The first way to generate an MLIR Module:
-# Pass the function and input data to the dynamo compiler's importer,
-# and accepts the generated module and weight parameters.
-module, params = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
-
-print(module)
-print(params)
-
-# The second way to generate an MLIR Module:
-# Execute the target function using a define-by-run style,
-# and get the module and weight parameters from the dynamo compiler's attribute.
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-
-foo_mlir(float32_in1, float32_in2)
-print(dynamo_compiler.imported_module)
-print(dynamo_compiler.imported_params)
-
-foo_mlir(int32_in1, int32_in2)
-print(dynamo_compiler.imported_module)
-print(dynamo_compiler.imported_params)
+# Hand `foo` and the float32 inputs to the dynamo compiler's importer, which
+# builds the graph(s). Lower the first graph to top-level IR (tosa, linalg,
+# etc.), then print the generated module and that graph's weight parameters.
+imported_graphs = dynamo_compiler.importer(foo, float32_in1, float32_in2)
+
+first_graph = imported_graphs[0]
+first_graph.lower_to_top_level_ir(do_params_pack=True)
+print(first_graph._imported_module)
+print(dynamo_compiler.imported_params[first_graph])
diff --git a/examples/BuddyResNet18/import-resnet18.py b/examples/BuddyResNet18/import-resnet18.py
new file mode 100644
index 000000000..c58f4a604
--- /dev/null
+++ b/examples/BuddyResNet18/import-resnet18.py
@@ -0,0 +1,45 @@
+# ===- import-resnet18.py ------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the test of resnet18 model.
+#
+# ===---------------------------------------------------------------------------
+
+import torch
+import torchvision
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+model = torchvision.models.resnet18()
+model = model.eval()
+
+# Initialize Dynamo Compiler with specific configurations as an importer.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+data = torch.randn([1, 3, 224, 224])
+# Import the model into MLIR module and parameters.
+with torch.no_grad():
+    graphs = dynamo_compiler.importer(model, data)
+
+assert len(graphs) == 1
+graphs[0].lower_to_top_level_ir(do_params_pack=True)
+print(graphs[0]._imported_module)
diff --git a/examples/DAPDialect/CMakeLists.txt b/examples/DAPDialect/CMakeLists.txt
index 21c766035..b147d5604 100644
--- a/examples/DAPDialect/CMakeLists.txt
+++ b/examples/DAPDialect/CMakeLists.txt
@@ -16,9 +16,9 @@ message(STATUS "Spliting size: ${SPLITING_SIZE}")
 # Buddy DAP Dialect FIR operation
 #-------------------------------------------------------------------------------
 
-add_executable(firLowpass firLowpass.cpp)
-add_dependencies(firLowpass buddy-opt)
-target_link_libraries(firLowpass
+add_executable(buddy-fir FIRLowpass.cpp)
+add_dependencies(buddy-fir buddy-opt)
+target_link_libraries(buddy-fir
   BuddyLibDAP
 )
 
@@ -26,9 +26,9 @@ target_link_libraries(firLowpass
 # Buddy DAP Dialect Biquad Operation
 #-------------------------------------------------------------------------------
 
-add_executable(biquad biquad.cpp)
-add_dependencies(biquad buddy-opt)
-target_link_libraries(biquad
+add_executable(buddy-biquad biquad.cpp)
+add_dependencies(buddy-biquad buddy-opt)
+target_link_libraries(buddy-biquad
   BuddyLibDAP
 )
 
@@ -36,8 +36,14 @@ target_link_libraries(biquad
 # Buddy DAP Dialect IIR Operation
 #-------------------------------------------------------------------------------
 
-add_executable(iirLowpass iirLowpass.cpp)
-add_dependencies(iirLowpass buddy-opt)
-target_link_libraries(iirLowpass 
+add_executable(buddy-iir-scalar IIRLowpass.cpp)
+add_dependencies(buddy-iir-scalar buddy-opt)
+target_link_libraries(buddy-iir-scalar 
   BuddyLibDAP
 )
+
+add_executable(buddy-iir-vectorization IIRVectorization.cpp)
+add_dependencies(buddy-iir-vectorization buddy-opt)
+target_link_libraries(buddy-iir-vectorization
+  BuddyLibDAPVectorization
+)
diff --git a/examples/DAPDialect/firLowpass.cpp b/examples/DAPDialect/FIRLowpass.cpp
similarity index 90%
rename from examples/DAPDialect/firLowpass.cpp
rename to examples/DAPDialect/FIRLowpass.cpp
index 6a4052e8d..cfce56091 100644
--- a/examples/DAPDialect/firLowpass.cpp
+++ b/examples/DAPDialect/FIRLowpass.cpp
@@ -1,4 +1,4 @@
-//===- FirLowpass.cpp - Example of DAP fir filter ----------------------===//
+//===- FIRLowpass.cpp - Example of DAP FIR Filter -------------------------===//
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
   if (argc == 3) {
     saveFileName = argv[2];
   }
-  cout << "Usage: FirLowpass [loadPath] [savePath]" << endl;
+  cout << "Usage: FIRLowpass [loadPath] [savePath]" << endl;
   cout << "Current specified path: \n";
   cout << "Load: " << fileName << endl;
   cout << "Save: " << saveFileName << endl;
@@ -53,6 +53,6 @@ int main(int argc, char *argv[]) {
   output.getAudioFile().setAudioBuffer(nullptr);
   dap::fir(&aud.getMemRef(), &kernel, &output.getMemRef());
   cout << "Saving file:" << endl;
-  cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl;
+  cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
   return 0;
 }
diff --git a/examples/DAPDialect/iirLowpass.cpp b/examples/DAPDialect/IIRLowpass.cpp
similarity index 82%
rename from examples/DAPDialect/iirLowpass.cpp
rename to examples/DAPDialect/IIRLowpass.cpp
index f3d152802..1b69ec08b 100644
--- a/examples/DAPDialect/iirLowpass.cpp
+++ b/examples/DAPDialect/IIRLowpass.cpp
@@ -1,4 +1,4 @@
-//===- iirLowpass.cpp - Example of DAP iir filter -------------------------===//
+//===- IIRLowpass.cpp - Example of DAP IIR Filter -------------------------===//
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,21 +30,23 @@ using namespace std;
 
 int main(int argc, char *argv[]) {
   string fileName = "../../tests/Interface/core/NASA_Mars.wav";
-  string saveFileName = "IIR_NASA_Mars.wav";
+  string saveFileName = "IIR_LOWPASS_NASA_Mars.wav";
   if (argc >= 2) {
     fileName = argv[1];
   }
   if (argc == 3) {
     saveFileName = argv[2];
   }
-  cout << "Usage: FirLowpass [loadPath] [savePath]" << endl;
+  cout << "Usage: IIRLowpass [loadPath] [savePath]" << endl;
   cout << "Current specified path: \n";
   cout << "Load: " << fileName << endl;
   cout << "Save: " << saveFileName << endl;
+  // Order of butterworth filter
   int order = 8;
+  // Each SOS matrix has 6 parameters.
   intptr_t kernelSize[2] = {int(order / 2), 6};
   MemRef<float, 2> kernel(kernelSize);
-
+  // cutoff frequency = 1000, fs = 48000.
   dap::iirLowpass<float, 2>(kernel, dap::butterworth<float>(order), 1000,
                             48000);
 
@@ -54,10 +56,10 @@ int main(int argc, char *argv[]) {
   output.fetchMetadata(aud.getAudioFile());
   output.getAudioFile().setAudioBuffer(nullptr);
 
-  dap::iir(&aud.getMemRef(), &kernel, &output.getMemRef());
+  dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef());
 
   cout << "Saving file:" << endl;
-  cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl;
+  cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
 
   return 0;
 }
diff --git a/examples/DAPDialect/IIRVectorization.cpp b/examples/DAPDialect/IIRVectorization.cpp
new file mode 100644
index 000000000..c7d0c1955
--- /dev/null
+++ b/examples/DAPDialect/IIRVectorization.cpp
@@ -0,0 +1,66 @@
+//===- IIRVectorization.cpp - Example of DAP IIR Vectorization ------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an end-to-end example of the IIR filter in buddy-mlir.
+// It generates the coefficients of a filter, applies the filter to a piece of
+// mono audio, and then saves the audio.
+// This file is linked with the object file generated using the DAP
+// vectorization pass to produce the executable file.
+//
+//===----------------------------------------------------------------------===//
+
+#include <buddy/DAP/DAP.h>
+#include <iostream>
+
+using namespace dap;
+using namespace std;
+
+int main(int argc, char *argv[]) {
+  string fileName = "../../tests/Interface/core/NASA_Mars.wav";
+  string saveFileName = "IIR_VECTORIZATION_PASS_NASA_Mars.wav";
+  if (argc >= 2) {
+    fileName = argv[1];
+  }
+  if (argc == 3) {
+    saveFileName = argv[2];
+  }
+  cout << "Usage: IIRVectorizationPass [loadPath] [savePath]" << endl;
+  cout << "Current specified path: \n";
+  cout << "Load: " << fileName << endl;
+  cout << "Save: " << saveFileName << endl;
+  // Order for butterworth filter.
+  int order = 8;
+  // Each SOS matrix has 6 parameters.
+  intptr_t kernelSize[2] = {int(order / 2), 6};
+  MemRef<float, 2> kernel(kernelSize);
+  // cutoff frequency = 1000, fs = 48000.
+  dap::iirLowpass<float, 2>(kernel, dap::butterworth<float>(order), 1000,
+                            48000);
+
+  auto aud = dap::Audio<float, 1>(fileName);
+  aud.getAudioFile().printSummary();
+  dap::Audio<float, 1> output;
+  output.fetchMetadata(aud.getAudioFile());
+  output.getAudioFile().setAudioBuffer(nullptr);
+
+  dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef(),
+           /*isVectorization=*/true);
+
+  cout << "Saving file:" << endl;
+  cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
+
+  return 0;
+}
diff --git a/examples/DAPDialect/biquad.cpp b/examples/DAPDialect/biquad.cpp
index 03709989e..14a78084a 100644
--- a/examples/DAPDialect/biquad.cpp
+++ b/examples/DAPDialect/biquad.cpp
@@ -1,4 +1,4 @@
-//===- biquad.cpp - Example of DAP iir filter -----------------------------===//
+//===- biquad.cpp - Example of DAP Biquad Filter --------------------------===//
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -53,6 +53,6 @@ int main(int argc, char *argv[]) {
   dap::biquad(&aud.getMemRef(), &kernel, &output.getMemRef());
 
   cout << "Saving file:" << endl;
-  cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl;
+  cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
   return 0;
 }
diff --git a/examples/MLIRTOSA/makefile b/examples/MLIRTOSA/makefile
index 3400ea99c..9c1ff1b7f 100644
--- a/examples/MLIRTOSA/makefile
+++ b/examples/MLIRTOSA/makefile
@@ -16,6 +16,13 @@ MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
 MTRIPLE := x86_64-apple-darwin
 endif
 
+tosa-resize-lower-to-linalg:
+	@${MLIR_OPT} ./tosa-resize.mlir \
+		-pass-pipeline="builtin.module( \
+			func.func(tosa-to-linalg) \
+		)" \
+		-o ./log.mlir
+
 tosa-resize-lower:
 	@${MLIR_OPT} ./tosa-resize.mlir \
 		-pass-pipeline="builtin.module( \
@@ -26,7 +33,6 @@ tosa-resize-lower:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -44,7 +50,6 @@ tosa-resize-translate:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -62,7 +67,6 @@ tosa-resize-run:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -81,7 +85,6 @@ tosa-sigmoid-lower:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -99,7 +102,6 @@ tosa-sigmoid-translate:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -117,7 +119,6 @@ tosa-sigmoid-run:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -136,7 +137,6 @@ tosa-log-lower:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -154,7 +154,6 @@ tosa-log-translate:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -172,7 +171,6 @@ tosa-log-run:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -191,7 +189,6 @@ tosa-add-lower:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -209,7 +206,6 @@ tosa-add-translate:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -227,7 +223,6 @@ tosa-add-run:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
 			convert-func-to-llvm, \
@@ -246,7 +241,6 @@ tosa-concat-lower:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			expand-strided-metadata, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
@@ -265,7 +259,6 @@ tosa-concat-translate:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			expand-strided-metadata, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
@@ -284,7 +277,6 @@ tosa-concat-run:
 			func-bufferize, \
 			func.func(buffer-deallocation, convert-linalg-to-loops), \
 			convert-scf-to-cf, \
-			convert-linalg-to-llvm, \
 			expand-strided-metadata, \
 			finalize-memref-to-llvm, \
 			convert-math-to-llvm, \
diff --git a/examples/MLIRTOSA/tosa-concat.mlir b/examples/MLIRTOSA/tosa-concat.mlir
index f1a1b1d4a..5c67b435a 100644
--- a/examples/MLIRTOSA/tosa-concat.mlir
+++ b/examples/MLIRTOSA/tosa-concat.mlir
@@ -3,7 +3,7 @@ func.func @main() {
   %0 = arith.constant dense<[[11.,12.],[30.,40.]]> : tensor<2x2xf32>
   %1 = arith.constant dense<[[12.,13.],[23.,45.],[11.,89.]]> : tensor<3x2xf32>
 
-  %output = "tosa.concat"(%0,%1) {axis=0} : (tensor<2x2xf32>,tensor<3x2xf32>) -> tensor<5x2xf32>
+  %output = "tosa.concat"(%0,%1) {axis=0 : i32} : (tensor<2x2xf32>,tensor<3x2xf32>) -> tensor<5x2xf32>
   %tensor_unranked = tensor.cast %output : tensor<5x2xf32> to tensor<*xf32>
   
   call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
diff --git a/examples/MLIRTensor/makefile b/examples/MLIRTensor/makefile
index 2790fbe32..0a8d1fa20 100644
--- a/examples/MLIRTensor/makefile
+++ b/examples/MLIRTensor/makefile
@@ -20,14 +20,14 @@ tensor-print-lower:
 	@${MLIR_OPT} ./tensor-print.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-print-translate:
 	@${MLIR_OPT} ./tensor-print.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
@@ -35,7 +35,7 @@ tensor-print-run:
 	@${MLIR_OPT} ./tensor-print.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
@@ -43,7 +43,7 @@ tensor-print-run:
 tensor-collapse-shape-lower:
 	@${MLIR_OPT} ./tensor-collapse-shape.mlir \
 		-arith-bufferize -tensor-bufferize -func-bufferize \
-		-finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \
+		-finalizing-bufferize -buffer-deallocation \
 		-expand-strided-metadata -lower-affine \
 		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
@@ -51,7 +51,7 @@ tensor-collapse-shape-lower:
 tensor-collapse-shape-translate:
 	@${MLIR_OPT} ./tensor-collapse-shape.mlir \
 		-arith-bufferize -tensor-bufferize -func-bufferize \
-		-finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \
+		-finalizing-bufferize -buffer-deallocation \
 		-expand-strided-metadata -lower-affine \
 		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
@@ -60,7 +60,7 @@ tensor-collapse-shape-translate:
 tensor-collapse-shape-run:
 	@${MLIR_OPT} ./tensor-collapse-shape.mlir \
 		-arith-bufferize -tensor-bufferize -func-bufferize \
-		-finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \
+		-finalizing-bufferize -buffer-deallocation  \
 		-expand-strided-metadata -lower-affine \
 		-finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
@@ -71,21 +71,22 @@ tensor-extract-lower:
 	@${MLIR_OPT} ./tensor-extract.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-extract-translate:
 	@${MLIR_OPT} ./tensor-extract.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
 tensor-extract-run:
 	@${MLIR_OPT} ./tensor-extract.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
@@ -94,14 +95,14 @@ tensor-extract-slice-lower:
 	@${MLIR_OPT} ./tensor-extract-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-extract-slice-translate:
 	@${MLIR_OPT} ./tensor-extract-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
@@ -109,7 +110,7 @@ tensor-extract-slice-run:
 	@${MLIR_OPT} ./tensor-extract-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
@@ -118,14 +119,14 @@ tensor-from-elements-lower:
 	@${MLIR_OPT} ./tensor-from-elements.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-from-elements-translate:
 	@${MLIR_OPT} ./tensor-from-elements.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
@@ -133,7 +134,7 @@ tensor-from-elements-run:
 	@${MLIR_OPT} ./tensor-from-elements.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
@@ -142,14 +143,14 @@ tensor-insert-lower:
 	@${MLIR_OPT} ./tensor-insert.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-insert-translate:
 	@${MLIR_OPT} ./tensor-insert.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
@@ -157,7 +158,7 @@ tensor-insert-run:
 	@${MLIR_OPT} ./tensor-insert.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
@@ -166,14 +167,14 @@ tensor-insert-slice-lower:
 	@${MLIR_OPT} ./tensor-insert-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts -o ./log.mlir
 
 tensor-insert-slice-translate:
 	@${MLIR_OPT} ./tensor-insert-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
@@ -181,7 +182,7 @@ tensor-insert-slice-run:
 	@${MLIR_OPT} ./tensor-insert-slice.mlir \
 		-arith-bufferize  -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \
 		-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \
-		-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+		 -finalize-memref-to-llvm -convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/MLIRTensor/tensor-insert-slice.mlir b/examples/MLIRTensor/tensor-insert-slice.mlir
index 9baac6ccb..cac15ca80 100644
--- a/examples/MLIRTensor/tensor-insert-slice.mlir
+++ b/examples/MLIRTensor/tensor-insert-slice.mlir
@@ -1,6 +1,6 @@
 // RUN: buddy-opt %s \
 // RUN:     -arith-bufferize  -tensor-bufferize -linalg-bufferize \
-// RUN:     -convert-linalg-to-loops -convert-scf-to-cf -func-bufferize \
+// RUN:     -convert-scf-to-cf -func-bufferize \
 // RUN:     -buffer-deallocation -convert-linalg-to-loops \
 // RUN:     -expand-strided-metadata \
 // RUN:     -finalize-memref-to-llvm -convert-func-to-llvm \
diff --git a/examples/MLIRTensor/tensor-print.mlir b/examples/MLIRTensor/tensor-print.mlir
index e33e0184b..ec0b14126 100644
--- a/examples/MLIRTensor/tensor-print.mlir
+++ b/examples/MLIRTensor/tensor-print.mlir
@@ -1,7 +1,7 @@
 // RUN: buddy-opt %s \
 // RUN:     -arith-bufferize  -tensor-bufferize -linalg-bufferize \
 // RUN:     -func-bufferize -buffer-deallocation -convert-linalg-to-loops \
-// RUN:     -convert-linalg-to-loops -convert-scf-to-cf -finalize-memref-to-llvm -convert-func-to-llvm \
+// RUN:     -convert-scf-to-cf -finalize-memref-to-llvm -convert-func-to-llvm \
 // RUN:     -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
diff --git a/examples/README.md b/examples/README.md
index 2f940537d..fc551ccdd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -92,7 +92,7 @@ $ buddy-opt <input> -lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}"
 
 ```
 $ cd buddy-mlir/build/bin
-$ ./buddy-opt ../../examples/DIPDialect/dip.mlir --lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}"
+$ ./buddy-opt ../../frontend/Interfaces/DIP.mlir --lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}"
 ```
 
 - Edge detection example:
@@ -237,7 +237,7 @@ Example:
 
 ```
 $ cd buddy-mlir/build/bin
-$ ./buddy-opt ../../examples/BudDialect/TestConstant.mlir --lower-bud
+$ ./buddy-opt ../../examples/BudDialect/bud-print.mlir --lower-bud
 ```
 
 ## DSL Examples
diff --git a/examples/VectorExpDialect/.gitignore b/examples/VectorExpDialect/.gitignore
index d32dc0c50..790429d34 100644
--- a/examples/VectorExpDialect/.gitignore
+++ b/examples/VectorExpDialect/.gitignore
@@ -1,2 +1,3 @@
 log*
 core
+a.out
diff --git a/examples/VectorExpDialect/makefile b/examples/VectorExpDialect/makefile
index 5be430a67..bfecdba83 100644
--- a/examples/VectorExpDialect/makefile
+++ b/examples/VectorExpDialect/makefile
@@ -124,6 +124,41 @@ vector-exp-predication-matmul-run:
 		-dlopen=${CROSS_MLIR_C_RUNNER_UTILS} \
 		-dlopen=${CROSS_MLIR_RUNNER_UTILS}
 
+vector-exp-predication-matmul-aot:
+	@${BUDDY_OPT} ./vector-exp-predication-matmul.mlir \
+		-lower-affine \
+		-convert-scf-to-cf \
+		-convert-math-to-llvm \
+		-lower-vector-exp \
+		-lower-rvv \
+		-convert-vector-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts |\
+	${BUDDY_TRANSLATE} --buddy-to-llvmir | \
+	${LLC} -mtriple riscv64  -mattr=+v,+m -riscv-v-vector-bits-min=128 --filetype=obj -o log.o
+	@${RISCV_GNU_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc log.o  \
+		-L${CROSS_MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+		-o a.out
+	@LD_LIBRARY_PATH=${CROSS_MLIR_LIB} ${QEMU} -L ${RISCV_GNU_TOOLCHAIN_SYSROOT} -cpu rv64,x-v=true,vlen=128 a.out
+
+# vector-exp-predication-matmul-elf:
+# 	@${BUDDY_OPT} ./vector-exp-predication-matmul.mlir \
+# 		-lower-affine \
+# 		-convert-scf-to-cf \
+# 		-convert-math-to-llvm \
+# 		-lower-vector-exp \
+# 		-lower-rvv \
+# 		-convert-vector-to-llvm \
+# 		-finalize-memref-to-llvm \
+# 		-convert-func-to-llvm \
+# 		-reconcile-unrealized-casts |\
+# 	${BUDDY_TRANSLATE} -buddy-to-llvmir | \
+# 	${LLC} -mtriple riscv64  -mattr=+v,+m -riscv-v-vector-bits-min=128 -filetype=obj -o log.o
+# 	@${RISCV_GNU_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc log.o  \
+# 		-static \
+# 		-o log.elf
+
 vector-exp-add-mask-run:
 	@${BUDDY_OPT} ./vector-exp-add-mask.mlir \
 		-lower-affine \
@@ -187,3 +222,7 @@ vector-exp-add-predication-asm:
 	${LLC} ${OPT_FLAG} -mtriple riscv64 -target-abi lp64d \
 		-mattr=+m,+d,+v -riscv-v-vector-bits-min=128 \
 		--filetype=asm -o log.s
+
+vector-exp-dynamic-vector-dump:
+	@${BUDDY_OPT} ./vector-exp-dynamic-vector.mlir \
+		-o log.mlir
diff --git a/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir b/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir
new file mode 100644
index 000000000..d792bacb0
--- /dev/null
+++ b/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir
@@ -0,0 +1,51 @@
+#map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
+
+func.func private @printMemrefI32(memref<*xi32>)
+
+func.func @alloc_mem_i32(%init: i32) -> memref<?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c20 = arith.constant 20 : index
+  %mem = memref.alloc(%c20) : memref<?xi32>
+  scf.for %idx0 = %c0 to %c20 step %c1 {
+    memref.store %init, %mem[%idx0] : memref<?xi32>
+  }
+  return %mem : memref<?xi32>
+}
+
+func.func @vector_add(%input1: memref<?xi32>, %input2: memref<?xi32>, %output: memref<?xi32>) {
+  %c0 = arith.constant 0 : index
+  // Get the dimension of the workload.
+  %dim_size = memref.dim %input1, %c0 : memref<?xi32>
+  // Perform dynamic vector addition.
+  // Returns four times the physical vl for element type i32.
+  %vl = vector_exp.get_vl i32, 4 : index
+
+  scf.for %idx = %c0 to %dim_size step %vl { // Tiling
+    %it_vl = affine.min #map(%idx)[%vl, %dim_size]
+    vector_exp.set_vl %it_vl : index {
+      %vec_input1 = vector.load %input1[%idx] : memref<?xi32>, vector<[1]xi32> // vector<?xi32>
+      %vec_input2 = vector.load %input2[%idx] : memref<?xi32>, vector<[1]xi32> // vector<?xi32>
+      %vec_output = arith.addi %vec_input1, %vec_input2 : vector<[1]xi32> // vector<?xi32>
+      vector.store %vec_output, %output[%idx] : memref<?xi32>, vector<[1]xi32> // vector<?xi32>
+      vector.yield
+    }
+  }
+  return
+}
+
+func.func @main() -> i32 {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+
+  %input_mem = call @alloc_mem_i32(%c1_i32) : (i32) -> memref<?xi32>
+  %result_mem = call @alloc_mem_i32(%c0_i32) : (i32) -> memref<?xi32>
+
+  call @vector_add(%input_mem, %input_mem, %result_mem) : (memref<?xi32>, memref<?xi32>, memref<?xi32>) -> ()
+
+  %print_result_mem = memref.cast %result_mem : memref<?xi32> to memref<*xi32>
+  call @printMemrefI32(%print_result_mem) : (memref<*xi32>) -> ()
+
+  %ret = arith.constant 0 : i32
+  return %ret : i32
+}
diff --git a/examples/lit.cfg.py b/examples/lit.cfg.py
index 5bb8746c3..724d9cdaa 100644
--- a/examples/lit.cfg.py
+++ b/examples/lit.cfg.py
@@ -36,7 +36,8 @@
 # subdirectories contain auxiliary inputs for various tests in their parent
 # directories.
 config.excludes = [
+    'BuddyBert',
     'BuddyLlama',
     'ConvOpt',
     'DAPDialect',
     'DIPDialect',
diff --git a/frontend/Interfaces/buddy/Core/Container.h b/frontend/Interfaces/buddy/Core/Container.h
index db8b66c17..242199811 100644
--- a/frontend/Interfaces/buddy/Core/Container.h
+++ b/frontend/Interfaces/buddy/Core/Container.h
@@ -25,10 +25,16 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <fcntl.h>
 #include <iostream>
+#include <fstream>
 #include <memory>
 #include <numeric>
 #include <stdexcept>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
 #include <vector>
 
 // MemRef descriptor.
@@ -54,6 +60,8 @@ template <typename T, size_t N> class MemRef {
   MemRef<T, N> &operator=(const MemRef<T, N> &other);
   // Move constructor.
   MemRef(MemRef<T, N> &&other) noexcept;
+  // Constructor from file.
+  MemRef(const std::string& filename, intptr_t sizes[N], intptr_t offset = 0, bool isMmap = false);
   // Move assignment operator.
   MemRef<T, N> &operator=(MemRef<T, N> &&other) noexcept;
   // Desctrutor.
@@ -96,6 +104,10 @@ template <typename T, size_t N> class MemRef {
   intptr_t sizes[N];
   // Strides.
   intptr_t strides[N];
+  // Number of elements.
+  size_t size;
+  // File descriptor for mmap
+  int fd = -1;
 };
 
 // MemRef Shape Constructor.
@@ -277,12 +289,79 @@ MemRef<T, N> &MemRef<T, N>::operator=(MemRef<T, N> &&other) noexcept {
   return *this;
 }
 
+// MemRef File Constructor.
+// Construct a MemRef of shape `sizes` from the raw binary contents of
+// `filename`. If `isMmap` is true the file is mapped read-only (its size must
+// equal the number of elements times `sizeof(T)`); otherwise the contents are
+// read into a buffer allocated with `new[]`. Throws `std::runtime_error` when
+// the file cannot be opened, sized, mapped or fully read.
+template <typename T, std::size_t N>
+MemRef<T, N>::MemRef(const std::string &filename, intptr_t sizes[N],
+                     intptr_t offset, bool isMmap) {
+  this->offset = offset;
+  for (size_t i = 0; i < N; i++) {
+    this->sizes[i] = sizes[i];
+  }
+  setStrides();
+  size = product(sizes);
+  const size_t numBytes = size * sizeof(T);
+  if (isMmap) {
+    // Work on a local descriptor: the destructor does not run when a
+    // constructor throws, so every error path below closes it by hand.
+    const int mapFd = open(filename.c_str(), O_RDONLY);
+    if (mapFd == -1) {
+      throw std::runtime_error("Failed to open file!");
+    }
+    struct stat sb;
+    if (fstat(mapFd, &sb) == -1) {
+      close(mapFd);
+      throw std::runtime_error("Failed to get file size!");
+    }
+    if (sb.st_size < 0 || static_cast<size_t>(sb.st_size) != numBytes) {
+      close(mapFd);
+      throw std::runtime_error("File size does not match!");
+    }
+    void *mapped = mmap(nullptr, numBytes, PROT_READ, MAP_PRIVATE, mapFd, 0);
+    if (mapped == MAP_FAILED) {
+      close(mapFd);
+      throw std::runtime_error("Failed to mmap!");
+    }
+    allocated = static_cast<T *>(mapped);
+    aligned = allocated;
+    // A valid `fd` tells the destructor that the buffer is a mapping.
+    fd = mapFd;
+  } else {
+    // Open the file before allocating so that a failure cannot leak memory.
+    std::ifstream in(filename, std::ios::in | std::ios::binary);
+    if (!in.is_open()) {
+      throw std::runtime_error("Failed to open file!");
+    }
+    std::unique_ptr<T[]> buffer(new T[size]);
+    in.read(reinterpret_cast<char *>(buffer.get()), numBytes);
+    if (static_cast<size_t>(in.gcount()) != numBytes) {
+      throw std::runtime_error("Failed to read file!");
+    }
+    allocated = buffer.release();
+    aligned = allocated;
+  }
+}
+
 // MemRef Destructor.
 // Note that the `allocated` and `aligned` point to the same address, so it is
 // enough to release the space of the `allocated` pointer in the destructor.
 template <typename T, std::size_t N> MemRef<T, N>::~MemRef() {
-  if (allocated)
-    free(allocated);
+  if (fd != -1) {
+    // A valid descriptor means the buffer was mapped by the file constructor,
+    // so it has to be unmapped instead of deleted.
+    // NOTE(review): the copy/move operations are not visible in this hunk;
+    // confirm they keep `fd` and `size` in step with `allocated`.
+    if (allocated)
+      munmap(allocated, size * sizeof(T));
+    close(fd);
+  } else {
+    // Deleting a null pointer is a no-op, so no check is required.
+    delete[] allocated;
+  }
 }
 
 // Get the data pointer.
diff --git a/frontend/Interfaces/buddy/DAP/DSP/IIR.h b/frontend/Interfaces/buddy/DAP/DSP/IIR.h
index c2c3bb1eb..bb3035a9f 100644
--- a/frontend/Interfaces/buddy/DAP/DSP/IIR.h
+++ b/frontend/Interfaces/buddy/DAP/DSP/IIR.h
@@ -30,13 +30,13 @@ namespace detail {
 // Declare the Fir C interface.
 extern "C" {
 // TODO: support both float and double.
-void _mlir_ciface_mlir_iir(MemRef<float, 1> *inputBuddyConv1D,
-                           MemRef<float, 2> *kernelBuddyConv1D,
-                           MemRef<float, 1> *outputBuddyConv1D);
-
 void _mlir_ciface_buddy_iir(MemRef<float, 1> *inputBuddyConv1D,
                             MemRef<float, 2> *kernelBuddyConv1D,
                             MemRef<float, 1> *outputBuddyConv1D);
+
+void _mlir_ciface_buddy_iir_vectorization(MemRef<float, 1> *inputBuddyConv1D,
+                                          MemRef<float, 2> *kernelBuddyConv1D,
+                                          MemRef<float, 1> *outputBuddyConv1D);
 }
 } // namespace detail
 
@@ -62,15 +62,19 @@ void iirLowpass(MemRef<T, N> &input, const zpk<T> &filter, T frequency, T fs) {
   }
 }
 
-template <typename T, size_t N, size_t M>
-void iir(MemRef<float, N> *input, MemRef<T, M> *filter,
-         MemRef<float, N> *output) {
+// The filter is given in Second-Order Sections (SOS) form, so it must be a
+// 2-dimensional MemRef (with the second dimension set to 6).
+template <typename T, size_t N>
+void IIR(MemRef<float, N> *input, MemRef<T, 2> *filter,
+         MemRef<float, N> *output, bool isVectorization=false) {
   if (N != 1)
     assert(0 && "Only mono audio is supported for now.");
-  if (M != 2)
-    assert(0 && "Second Order Section (SOS) filter is only supported for now.");
-  detail::_mlir_ciface_buddy_iir(input, filter, output);
+  if (!isVectorization)
+    detail::_mlir_ciface_buddy_iir(input, filter, output);
+  else
+    detail::_mlir_ciface_buddy_iir_vectorization(input, filter, output);
 }
+
 } // namespace dap
 
 #endif // FRONTEND_INTERFACES_BUDDY_DAP_DSP_IIR
diff --git a/frontend/Interfaces/buddy/DIP/ImageContainer.h b/frontend/Interfaces/buddy/DIP/ImageContainer.h
index d90e25337..a25fd4476 100644
--- a/frontend/Interfaces/buddy/DIP/ImageContainer.h
+++ b/frontend/Interfaces/buddy/DIP/ImageContainer.h
@@ -127,6 +127,7 @@ template <typename T, size_t N> Img<T, N> &Img<T, N>::operator=(Img<T, N> &&m) {
 template <typename T, size_t N>
 Img<T, N> &Img<T, N>::operator=(const Img<T, N> &m) {
   MemRef<T, N>::operator=(m);
+  return *this;
 }
 
 /**
diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h b/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h
index e0199499d..5fb018ec2 100644
--- a/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h
+++ b/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h
@@ -111,14 +111,14 @@ template <typename T, size_t N> class RMByteStream : public RLByteStream<T, N> {
 class WBaseStream {
 public:
   // methods
-  WBaseStream();
-  virtual ~WBaseStream();
+  inline WBaseStream();
+  inline virtual ~WBaseStream();
 
-  virtual bool open(const String &filename);
-  virtual bool open(std::vector<uchar> &buf);
-  virtual void close();
-  bool isOpened();
-  int getPos();
+  inline virtual bool open(const String &filename);
+  inline virtual bool open(std::vector<uchar> &buf);
+  inline virtual void close();
+  inline bool isOpened();
+  inline int getPos();
 
 protected:
   uchar *m_start;
@@ -130,9 +130,9 @@ class WBaseStream {
   bool m_is_opened;
   std::vector<uchar> *m_buf;
 
-  virtual void writeBlock();
-  virtual void release();
-  virtual void allocate();
+  inline virtual void writeBlock();
+  inline virtual void release();
+  inline virtual void allocate();
 };
 
 // class WLByteStream - uchar-oriented stream.
@@ -140,12 +140,12 @@ class WBaseStream {
 // first
 class WLByteStream : public WBaseStream {
 public:
-  virtual ~WLByteStream();
+  inline virtual ~WLByteStream();
 
-  void putByte(int val);
-  void putBytes(const void *buffer, int count);
-  void putWord(int val);
-  void putDWord(int val);
+  inline void putByte(int val);
+  inline void putBytes(const void *buffer, int count);
+  inline void putWord(int val);
+  inline void putDWord(int val);
 };
 
 // class WLByteStream - uchar-oriented stream.
@@ -153,9 +153,9 @@ class WLByteStream : public WBaseStream {
 // last
 class WMByteStream : public WLByteStream {
 public:
-  virtual ~WMByteStream();
-  void putWord(int val);
-  void putDWord(int val);
+  inline virtual ~WMByteStream();
+  inline void putWord(int val);
+  inline void putDWord(int val);
 };
 
 inline unsigned BSWAP(unsigned v) {
@@ -165,7 +165,7 @@ inline unsigned BSWAP(unsigned v) {
 
 const int BS_DEF_BLOCK_SIZE = 1 << 15;
 
-bool bsIsBigEndian(void) {
+inline bool bsIsBigEndian(void) {
   return (((const int *)"\0\x1\x2\x3\x4\x5\x6\x7")[0] & 255) != 0;
 }
 
diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h b/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h
index b2ad33a38..2866a4916 100644
--- a/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h
+++ b/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h
@@ -105,7 +105,7 @@ class PngEncoder : public BaseImageEncoder<T, N> {
   static void writeDataToBuf(void *png_ptr, uchar *src, size_t size);
 };
 
-bool isBigEndian() {
+inline bool isBigEndian() {
   int num = 1;
   char *ptr = (char *)&num;
   return (*ptr == 0);
diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h b/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h
index 1b39d3577..a4854a7f4 100644
--- a/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h
+++ b/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h
@@ -308,4 +308,4 @@ static bool imwrite(const String &filename, Img<T, N> &img_vec) {
   return true;
 }
 } // namespace dip
-#endif
\ No newline at end of file
+#endif
diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h b/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h
index f848fdc97..52dcbd764 100644
--- a/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h
+++ b/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h
@@ -52,7 +52,7 @@
 #include "buddy/DIP/imgcodecs/replenishment.h"
 
 namespace dip {
-int validateToInt(size_t sz) {
+inline int validateToInt(size_t sz) {
   int valueInt = (int)sz;
   assert((size_t)valueInt == sz);
   return valueInt;
@@ -77,26 +77,31 @@ struct PaletteEntry {
 #define descale(x, n) (((x) + (1 << ((n)-1))) >> (n))
 #define saturate(x) (uchar)(((x) & ~255) == 0 ? (x) : ~((x) >> 31))
 
-void icvCvt_BGR2Gray_8u_C3C1R(const uchar *bgr, int bgr_step, uchar *gray,
-                              int gray_step, _Size size, int swap_rb = 0);
-
-void FillGrayPalette(PaletteEntry *palette, int bpp, bool negative = false);
-bool IsColorPalette(PaletteEntry *palette, int bpp);
-void CvtPaletteToGray(const PaletteEntry *palette, uchar *grayPalette,
-                      int entries);
-uchar *FillUniColor(uchar *data, uchar *&line_end, int step, int width3, int &y,
-                    int height, int count3, PaletteEntry clr);
-uchar *FillUniGray(uchar *data, uchar *&line_end, int step, int width3, int &y,
-                   int height, int count3, uchar clr);
-uchar *FillColorRow8(uchar *data, uchar *indices, int len,
-                     PaletteEntry *palette);
-uchar *FillGrayRow8(uchar *data, uchar *indices, int len, uchar *palette);
-uchar *FillColorRow4(uchar *data, uchar *indices, int len,
-                     PaletteEntry *palette);
-uchar *FillGrayRow4(uchar *data, uchar *indices, int len, uchar *palette);
-uchar *FillColorRow1(uchar *data, uchar *indices, int len,
-                     PaletteEntry *palette);
-uchar *FillGrayRow1(uchar *data, uchar *indices, int len, uchar *palette);
+inline void icvCvt_BGR2Gray_8u_C3C1R(const uchar *bgr, int bgr_step,
+                                     uchar *gray, int gray_step, _Size size,
+                                     int swap_rb = 0);
+
+inline void FillGrayPalette(PaletteEntry *palette, int bpp,
+                            bool negative = false);
+inline bool IsColorPalette(PaletteEntry *palette, int bpp);
+inline void CvtPaletteToGray(const PaletteEntry *palette, uchar *grayPalette,
+                             int entries);
+inline uchar *FillUniColor(uchar *data, uchar *&line_end, int step, int width3,
+                           int &y, int height, int count3, PaletteEntry clr);
+inline uchar *FillUniGray(uchar *data, uchar *&line_end, int step, int width3,
+                          int &y, int height, int count3, uchar clr);
+inline uchar *FillColorRow8(uchar *data, uchar *indices, int len,
+                            PaletteEntry *palette);
+inline uchar *FillGrayRow8(uchar *data, uchar *indices, int len,
+                           uchar *palette);
+inline uchar *FillColorRow4(uchar *data, uchar *indices, int len,
+                            PaletteEntry *palette);
+inline uchar *FillGrayRow4(uchar *data, uchar *indices, int len,
+                           uchar *palette);
+inline uchar *FillColorRow1(uchar *data, uchar *indices, int len,
+                            PaletteEntry *palette);
+inline uchar *FillGrayRow1(uchar *data, uchar *indices, int len,
+                           uchar *palette);
 
 #define SCALE 14
 #define cR (int)(0.299 * (1 << SCALE) + 0.5)
diff --git a/frontend/Interfaces/buddy/LLM/TextContainer.h b/frontend/Interfaces/buddy/LLM/TextContainer.h
index b5e307abd..28432b3c1 100644
--- a/frontend/Interfaces/buddy/LLM/TextContainer.h
+++ b/frontend/Interfaces/buddy/LLM/TextContainer.h
@@ -325,7 +325,7 @@ template <typename T, size_t N> std::string Text<T, N>::revertLlama() {
   const int CLS_ID = 1;
   const int SEP_ID = 2;
 
-  for (size_t i = 0; i < this->getSize(); i++) {
+  for (size_t i = 0; i < this->tokenCnt; i++) {
     int id = this->aligned[i];
     if (id == PAD_ID || id == CLS_ID)
       continue;
diff --git a/frontend/Interfaces/lib/CMakeLists.txt b/frontend/Interfaces/lib/CMakeLists.txt
index e70a24034..9f6f61b29 100644
--- a/frontend/Interfaces/lib/CMakeLists.txt
+++ b/frontend/Interfaces/lib/CMakeLists.txt
@@ -65,3 +65,40 @@ SET_TARGET_PROPERTIES(BuddyLibDAP PROPERTIES
   LINKER_LANGUAGE CXX
   ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY}
   )
+
+# Build the vectorized DAP object file: append `_vectorization` to the
+# `buddy_fir`, `buddy_iir` and `buddy_biquad` names in DAP.mlir, lower the
+# result with the `-vectorize-dap` pipeline, and compile it with llc. DAP.mlir
+# is listed in DEPENDS so that editing it regenerates the object file.
+add_custom_command(OUTPUT DAPVectorization.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir |
+          sed 's/buddy_fir/buddy_fir_vectorization/' |
+          sed 's/buddy_iir/buddy_iir_vectorization/' |
+          sed 's/buddy_biquad/buddy_biquad_vectorization/' |
+          ${CMAKE_BINARY_DIR}/bin/buddy-opt
+              -vectorize-dap
+              -convert-linalg-to-affine-loops
+              -arith-expand
+              -lower-affine
+              -convert-scf-to-cf
+              -convert-math-to-llvm
+              -convert-vector-to-llvm
+              -finalize-memref-to-llvm
+              -llvm-request-c-wrappers
+              -convert-func-to-llvm
+              -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+          ${LLVM_MLIR_BINARY_DIR}/llc
+              -mtriple=${BUDDY_TARGET_TRIPLE}
+              -mattr=${BUDDY_OPT_ATTR}
+              -filetype=obj
+              -o ${CMAKE_CURRENT_BINARY_DIR}/DAPVectorization.o
+  DEPENDS buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir
+  )
+
+add_library(BuddyLibDAPVectorization STATIC DAPVectorization.o)
+
+SET_TARGET_PROPERTIES(BuddyLibDAPVectorization PROPERTIES
+  LINKER_LANGUAGE CXX
+  ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY}
+  )
diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py
index 24002fd64..e89597800 100644
--- a/frontend/Python/frontend.py
+++ b/frontend/Python/frontend.py
@@ -16,32 +16,44 @@
 #
 # This is the entry of the Buddy Compiler frontend.
 #
+# TODO[Low]: When integrating more frameworks, `frontend.py` acts as a unified
+# entry and driver, separating out compilers/importers for various platforms
+# (e.g. DynamoCompiler).
+#
 # ===---------------------------------------------------------------------------
 
-import operator
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
-import functools
+import operator
+import os
+import ctypes
+import platform
 
-import mlir.dialects.func as func
 import mlir.ir as ir
+import mlir.dialects.func as func
+from mlir.passmanager import *
+from mlir.execution_engine import *
+from mlir import runtime as rt
 import torch
 import torch._dynamo as dynamo
 from torch._functorch.aot_autograd import aot_module_simplified
 import torch.utils._pytree as pytree
 
-from .ops.math import ops_registry as math_ops_registry
-from .ops.tosa import ops_registry as tosa_ops_registry
 from .ops.linalg import ops_registry as linalg_ops_registry
+from .ops.tosa import ops_registry as tosa_ops_registry
+from .ops.math import ops_registry as math_ops_registry
+from .graph import Graph, TensorDType, TensorMeta
+from .graph.operation import *
+from .graph.transform import maxpool2d_simplify
 
 
 class DynamoCompiler:
     """
     Dynamo Compiler is one of the frontends of Buddy Compiler.
-    Dynamo Compiler acts as a custom compiler for the Torch Dynamo framework,
-    which converts an FX Graph into an equivalent MLIR module.
+    Dynamo Compiler acts as a custom compiler for the TorchDynamo framework,
+    which converts an FX Graph into an equivalent Buddy Graph and MLIR module.
 
     Attributes:
-        imported_module: The imported MLIR module after compilation.
+        imported_graphs: The imported graphs.
         imported_params: The imported parameters from the model.
     """
 
@@ -50,80 +62,279 @@ def __init__(
         func_name: str = "forward",
         primary_registry: Optional[dict] = None,
         aot_autograd_decomposition: Optional[dict] = None,
-        do_param_pack: bool = True,
     ) -> None:
         """
         Initializes the Dynamo Compiler.
 
         Args:
-            func_name (str, optional): The function name to be used.
+            func_name: The function name to be used.
             primary_registry (dict, optional): The primary operations registry.
             aot_autograd_decomposition (Optional[dict], optional):
-                The ahead-of-time autograd decomposition dictionary.
+            The ahead-of-time autograd decomposition dictionary.
+        Attributes:
+            _func_name: The function name to be used.
+            _aot_autograd_decomposition (Optional[dict], optional):
+            The ahead-of-time autograd decomposition dictionary.
+            _imported_graphs: The buddy graphs from dynamo importer.
+            _ops_registry (dict, optional): The buddy operations' lower func
+            registry.
+            _imported_params: The model params extract from torch.
+            _ops_map: The torch aten ops map with buddy ops.
+
         """
         if primary_registry is None:
             primary_registry = {}
         self._func_name = func_name
         self._aot_autograd_decomposition = aot_autograd_decomposition
-        self._imported_module = None
-        self._imported_params = None
-        self._do_param_pack = do_param_pack
+        self._imported_graphs = []
         self._ops_registry = {}
+        self._imported_params = {}
         self._ops_registry.update(math_ops_registry)
         self._ops_registry.update(linalg_ops_registry)
         self._ops_registry.update(tosa_ops_registry)
         self._ops_registry.update(primary_registry)
+        self._ops_map = {
+            "output": OutputOp,
+            "placeholder": PlaceholderOp,
+            "arange.start": ArangeOp,
+            "arange.default": ArangeOp,
+            "unsqueeze.default": UnsqueezeOp,
+            "view.default": ViewOp,
+            "ones.default": OnesOp,
+            "full.default": FullOp,
+            "lt.Tensor": LessThanOp,
+            "embedding.default": EmbeddingOp,
+            "masked_fill.Scalar": MaskedFillOp,
+            "slice.Tensor": SliceOp,
+            "expand.default": ExpandOp,
+            "_to_copy.default": ToCopyOp,
+            "rsub.Scalar": RsubOp,
+            "pow.Tensor_Scalar": PowOp,
+            "mean.dim": MeanOp,
+            "rsqrt.default": RsqrtOp,
+            "mul.Tensor": MulOp,
+            "t.default": TOp,
+            "mm.default": MatmulOp,
+            "transpose.int": TransposeOp,
+            "index.Tensor": IndexOp,
+            "neg.default": NegOp,
+            "cat.default": CatOp,
+            "squeeze.dim": SqueezeOp,
+            "bmm.default": BatchMatmulOp,
+            "div.Tensor": DivOp,
+            "_softmax.default": SoftmaxOp,
+            "clone.default": CloneOp,
+            "silu.default": SiluOp,
+            "add.Tensor": AddOp,
+            "addmm.default": AddMMOp,
+            "permute.default": PermuteOp,
+            "convert_element_type.default": ConvertElementTypeOp,
+            "sum.dim_IntList": SumDimOp,
+            "tanh.default": TanhOp,
+            "sub.Tensor": SubOp,
+            "var_mean.correction": VarMeanOp,
+            "amax.default": AmaxOp,
+            "select.int": SelectOp,
+            "exp.default": ExpOp,
+            "erf.default": ErfOp,
+            "getitem": GetItemOp,
+            "convolution.default": Conv2dOp,
+            "max_pool2d_with_indices.default": MaxPool2dWithIndicesOp,
+            "relu.default": ReluOp,
+            "iota.default": IotaOp,
+            "sigmoid.default": SigmoidOp,
+            "scalar_tensor.default": ScalarTensorOp,
+            "where.self": WhereOp,
+            "sqrt.default": SqrtOp,
+            "reciprocal.default": ReciprocalOp,
+        }
 
     @property
-    def imported_module(self):
-        """Returns the imported MLIR module after compilation."""
-        return self._imported_module
+    def imported_graphs(self):
+        """Returns the imported buddy graphs after compilation."""
+        return self._imported_graphs
 
     @property
     def imported_params(self):
-        """Returns the imported parameters from the model."""
+        """Returns the imported model params after compilation."""
         return self._imported_params
 
+    def _torch_dtype_translate(self, dtype):
+        """Map a torch dtype name (e.g. "torch.int64") to its TensorDType."""
+        if dtype == "torch.int64":
+            return TensorDType.Int64
+        if dtype == "torch.int32":
+            return TensorDType.Int32
+        if dtype == "torch.float16":
+            return TensorDType.Float16
+        if dtype == "torch.float32":
+            return TensorDType.Float32
+        if dtype == "torch.float64":
+            return TensorDType.Float64
+        if dtype == "torch.bool":
+            return TensorDType.Bool
+        # Any other name has no TensorDType counterpart in this table.
+        raise NotImplementedError(f"Unsupported dtype: {dtype}")
+
+    def _create_node(
+        self,
+        gm_node_name: str,
+        node_name: str,
+        node_input: Tuple,
+        node_users: List[str],
+        node_output_shape: Optional[list] = None,
+        node_output_dtype: Optional[TensorDType] = None,
+        node_kwargs: Optional[Dict] = None,
+    ):
+        """
+        Create a buddy op node from a torch aten op.
+
+        Args:
+            gm_node_name: Key into `_ops_map` selecting the buddy op class.
+            node_name: The op node name to be used.
+            node_input: The args input to the op node.
+            node_users: Names of the nodes using this node (its children).
+            node_output_shape: The op node's output shape; [] when omitted.
+            node_output_dtype: The TensorDType of the op node's output.
+            node_kwargs: The remaining keyword attributes of the op node.
+        """
+        buddy_node = self._ops_map[gm_node_name]()
+        buddy_node._name = node_name
+        if gm_node_name == "output":
+            for input_arg in node_input[0]:
+                buddy_node.add_argument(str(input_arg))
+            return buddy_node
+        for input_arg in node_input:
+            if isinstance(input_arg, torch.fx.Node):
+                buddy_node.add_argument(str(input_arg))
+                buddy_node.add_parent(str(input_arg))
+            elif isinstance(input_arg, torch.dtype):
+                buddy_node.add_argument(self._torch_dtype_translate(str(input_arg)))
+            else:
+                buddy_node.add_argument(input_arg)
+        for user in node_users:
+            buddy_node.add_children(user)
+        buddy_node._keyword_arguments.update(node_kwargs or {})
+        # Give each node its own list so nodes never share a default object.
+        if node_output_shape is None:
+            node_output_shape = []
+        buddy_node._tensor_meta["shape"] = node_output_shape
+        buddy_node._tensor_meta["dtype"] = node_output_dtype
+        return buddy_node
+
     def _compile_fx(
         self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]
     ) -> Any:
         """
-        Compiles the provided FX Graph to MLIR module.
+        Compiles the provided FX Graph to Buddy Graph.
 
         Args:
             gm (torch.fx.GraphModule): The GraphModule to be compiled.
             inputs (List[torch.Tensor]): The input tensors.
 
         Returns:
-            Any: The result of the ahead-of-time compiled module.
+            dynamo_run: The function of the ahead-of-time compiled module,
+            return for torchdynamo's call.
         """
 
-        def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]):
-            """Compile a FX graph in Aten/Prims IR to MLIR."""
-            func_params = _inputs[: len(self.imported_params)]
-            func_inputs = _inputs[len(self.imported_params) :]
-
-            # Initializes the MLIR context.
-            ctx = ir.Context()
-            with ir.Location.unknown(ctx):
-                fx_importer = FXGraphImporter(
-                    _gm,
-                    func_params,
-                    func_inputs,
-                    self._do_param_pack,
-                    self._func_name,
-                    self._ops_registry,
-                )
-                self._imported_module = fx_importer.import_graph()
-            # TODO: Lower to LLVM dialect and use JIT engine to execute.
-            return _gm.forward
-
         params = {
             **dict(gm.named_parameters(remove_duplicate=False)),
             **dict(gm.named_buffers(remove_duplicate=False)),
         }
         params_flat, _ = pytree.tree_flatten(params)
-        self._imported_params = params_flat
+
+        def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]):
+            """Compile a FX graph in Aten/Prims IR to MLIR."""
+            nonlocal params_flat
+            func_inputs = []
+            for inp in _inputs[len(params_flat) :]:
+                inp_shape = inp.shape
+                inp_dtype = self._torch_dtype_translate(str(inp.dtype))
+                func_inputs.append(TensorMeta(inp_shape, inp_dtype))
+            fake_params = []
+            for param in params_flat:
+                param_dtype = self._torch_dtype_translate(str(param.dtype))
+                fake_params.append(TensorMeta(param.shape, param_dtype))
+            graph = Graph(
+                func_inputs,
+                fake_params,
+                self._ops_registry,
+                self._func_name,
+            )
+            for gm_node in _gm.graph.nodes:
+                node_users = []
+                for user in gm_node.users.keys():
+                    node_users.append(str(user))
+                if gm_node.op == "placeholder":
+                    node_dtype = self._torch_dtype_translate(
+                        str(gm_node.meta["tensor_meta"].dtype)
+                    )
+                    buddy_node = self._create_node(
+                        gm_node.op,
+                        gm_node.name,
+                        gm_node.args,
+                        node_users,
+                        gm_node.meta["tensor_meta"].shape,
+                        node_dtype,
+                    )
+
+                elif gm_node.op == "output":
+                    buddy_node = self._create_node(
+                        gm_node.op,
+                        gm_node.name,
+                        gm_node.args,
+                        node_users
+                    )
+
+                elif gm_node.target is operator.getitem:
+                    node_dtype = self._torch_dtype_translate(
+                        str(gm_node.meta["tensor_meta"].dtype)
+                    )
+                    buddy_node = self._create_node(
+                        str(gm_node.target.__name__),
+                        gm_node.name,
+                        gm_node.args,
+                        node_users,
+                        gm_node.meta["tensor_meta"].shape,
+                        node_dtype,
+                    )
+
+                else:
+                    tensor_meta = gm_node.meta.get("tensor_meta")
+                    val = gm_node.meta.get("val")
+                    num_returns = len(gm_node.target._schema.returns)
+                    if num_returns == 1:
+                        node_dtype = self._torch_dtype_translate(
+                            str(tensor_meta.dtype)
+                        )
+                        node_shape = tensor_meta.shape
+                    elif num_returns > 1:
+                        node_dtype = tuple(
+                            [
+                                self._torch_dtype_translate(str(val_item.dtype))
+                                for val_item in val
+                            ]
+                        )
+                        node_shape = tuple([val_item.shape for val_item in val])
+                    else:
+                        raise RuntimeError("Zero returns is not supported.")
+
+                    buddy_node = self._create_node(
+                        str(gm_node.target.__name__),
+                        gm_node.name,
+                        gm_node.args,
+                        node_users,
+                        node_shape,
+                        node_dtype,
+                        node_kwargs=gm_node.kwargs,
+                    )
+
+                graph.add_node(buddy_node)
+            transform_list = [maxpool2d_simplify]
+            graph.perform(transform_list)
+            self._imported_graphs.append(graph)
+            self._imported_params[graph] = params_flat
+            return self.dynamo_run()
 
         return aot_module_simplified(
             gm,
@@ -143,11 +354,12 @@ def __call__(
             inputs (List[torch.Tensor]): The input tensors.
 
         Returns:
-            Any: The result of the ahead-of-time compiled module.
+            dynamo_run: The function of the ahead-of-time compiled module,
+            return for torchdynamo's call.
         """
         return self._compile_fx(gm, inputs)
 
-    def importer(self, model, *args, **kwargs):
+    def importer(self, model, *args, **kwargs) -> List[Graph]:
         """
         Imports the provided model as MLIR module and flat parameters.
 
@@ -157,212 +369,145 @@ def importer(self, model, *args, **kwargs):
             kwargs: Keyword arguments for the model.
 
         Returns:
-            module: The imported MLIR module.
-            params: The imported flat parameters.
+            imported_graphs: The imported buddy graphs.
         """
         model_opt = dynamo.optimize(self._compile_fx)(model)
         model_opt(*args, **kwargs)
-        module = self._imported_module
-        params = self._imported_params
-        return module, params
-
-
-class FXGraphImporter:
-    """
-    Imports an FX graph and generates an MLIR module in high-level dialects.
-
-    Attributes:
-        _symbol_table (dict): A dictionary to keep track of the symbols.
-        _gm (torch.fx.GraphModule): The FX graph module to be imported.
-        _func_name (str): Name of the generated MLIR function.
-        _inputs (List[torch.Tensor]): Input tensor(s) of the FX graph.
-        _num_input_visited (int): Number of input nodes that have been visited.
-        _module (mlir.ir.Module): The generated MLIR module.
-        _ops_registry (dict): Registry for the candidate operations.
-    """
-
-    def __init__(
-        self,
-        gm: torch.fx.GraphModule,
-        params: List[torch.Tensor],
-        inputs: List[torch.Tensor],
-        do_param_pack: bool = True,
-        func_name: str = "forward",
-        ops_registry: Optional[dict] = None,
-    ):
-        """
-        Initializes the FX Graph importer.
-
-        Args:
-            gm (torch.fx.GraphModule): The FX graph that will be imported.
-            inputs (List[torch.Tensor]): Input tensor(s) of the FX graph.
-            func_name (str): Name of the generated MLIR function.
-            ops_registry (dict): Registry for the candidate operations.
-        """
-        if ops_registry is None:
-            ops_registry = {}
-        self._symbol_table = {}
-        self._gm = gm
-        self._func_name = func_name
-        self._params = params
-        self._inputs = inputs
-        self._do_param_pack = do_param_pack
-        self._param_packs = []
-        self._num_input_visited = 0
-        self._module = ir.Module.create()
-        self._ops_registry = ops_registry
-        self._current_param_pack_offset = None
-
-    def _torch_dtype_to_mlir_dtype(self, dtype: torch.dtype) -> ir.Type:
-        """
-        Converts a torch dtype to the corresponding MLIR dtype.
-
-        Args:
-            dtype (torch.dtype): The torch data type.
+        return self._imported_graphs
 
-        Returns:
-            mlir.ir.Type: The corresponding MLIR data type.
-
-        Raises:
-            NotImplementedError: If the given dtype is not supported.
-        """
-        match dtype:
-            case torch.int32:
-                return ir.IntegerType.get_signless(32)
-            case torch.int64:
-                return ir.IntegerType.get_signless(64)
-            case torch.float32:
-                return ir.F32Type.get()
-            case torch.bool:
-                return ir.IntegerType.get_signless(1)
-            case _:
-                raise NotImplementedError(f"Unsupported dtype {dtype}")
-
-    def _pack_params(self) -> None:
-        dtypes = list(set([param.dtype for param in self._params]))
-        dtypes.sort(key=str)
-        self._current_param_pack_offset = {dtype: 0 for dtype in dtypes}
-        for dtype in dtypes:
-            params_of_dtype = [
-                param for param in self._params if param.dtype == dtype
-            ]
-            param_total_size = 0
-            for param in params_of_dtype:
-                param_total_size += functools.reduce(
-                    lambda x, y: x * y, list(param.shape)
-                )
-            mlir_dtype = self._torch_dtype_to_mlir_dtype(dtype)
-            self._param_packs.append(
-                ir.RankedTensorType.get([param_total_size], mlir_dtype)
-            )
-
-    def import_graph(self) -> ir.Module:
+    def dynamo_run(self):
         """
-        Imports FX graph and generates an MLIR module in high-level dialects.
+        A callable method that wraps around the `exec_buddy_graph` method.
 
         Returns:
-            mlir.ir.Module: An MLIR module in high-level dialects.
-        """
-        with ir.InsertionPoint(self._module.body):
-            arguments = []
-            if self._do_param_pack:
-                self._pack_params()
-                arguments.extend(self._param_packs)
-                inputs = self._inputs
-            else:
-                inputs = self._params + self._inputs
-            for arg in inputs:
-                shape_list = list(arg.shape)
-                torch_dtype = arg.dtype
-                mlir_dtype = self._torch_dtype_to_mlir_dtype(torch_dtype)
-                tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype)
-                arguments.append(tensor_arg)
-
-            @func.FuncOp.from_py_func(*arguments, name=self._func_name)
-            def generated_func(*args):
-                args_list = list(args)
-                for node in self._gm.graph.nodes:
-                    if not (
-                        node.op in ["output", "placeholder", "call_function"]
-                        or node.target is operator.getitem
-                    ):
-                        continue
-                    if node.op == "output":
-                        output_node_args = node.args[0]
-                        returns = [
-                            self._symbol_table.get((str(output_arg), 0))
-                            for output_arg in output_node_args
-                        ]
-                        self._symbol_table[("output", 0)] = returns
-                    elif node.op == "placeholder":
-                        self._import_placeholder(node, args_list)
-                    elif node.target is operator.getitem:
-                        self._symbol_table[
-                            (str(node.name), 0)
-                        ] = self._symbol_table[
-                            (str(node.args[0]), node.args[1])
-                        ]
-                    else:
-                        self._import_op(node)
-
-                return self._symbol_table.get(("output", 0))
-
-        return self._module
-
-    def _import_placeholder(
-        self, node: torch.fx.Node, args_list: List[ir.BlockArgument]
-    ):
+            exec_buddy_graph: A callable that executes the compiled graph; it
+            is returned so that TorchDynamo can call it.
         """
-        Imports a placeholder node from the FX graph.
 
-        Args:
-            node (torch.fx.Node): The FX node representing the placeholder.
-            args_list (List[mlir.ir.BlockArgument]): List of input tensors.
-        """
-        if self._num_input_visited < len(self._params):
-            dtype = node.meta["tensor_meta"].dtype
-            pack_of_dtype = None
-            for pack in args_list:
-                if ir.RankedTensorType(
-                    pack.type
-                ).element_type == self._torch_dtype_to_mlir_dtype(dtype):
-                    pack_of_dtype = pack
-                    break
-            placeholder_name = self._ops_registry["param.extract"](
-                node, self._current_param_pack_offset[dtype], pack_of_dtype
-            ).result
-            self._current_param_pack_offset[dtype] += functools.reduce(
-                lambda x, y: x * y, list(node.meta["tensor_meta"].shape)
-            )
-        else:
-            if len(self._params) > 0:
-                placeholder_name = args_list[
-                    self._num_input_visited
-                    - len(self._params)
-                    + len(self._param_packs)
-                ]
+        def get_lib_extension():
+            if platform.system() == "Linux":
+                return ".so"
+            elif platform.system() == "Darwin":
+                return ".dylib"
             else:
-                placeholder_name = args_list[self._num_input_visited]
-
-        self._symbol_table[(str(node.name), 0)] = placeholder_name
-        self._num_input_visited += 1
-
-    def _import_op(self, node: torch.fx.Node):
-        """
-        Imports an operation node from the FX graph.
-
-        Args:
-            node (torch.fx.Node): The FX node representing the operation.
-
-        """
-        op_name = node.target.__name__
-        op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = (
-            self._ops_registry[op_name](node, self._symbol_table)
+                raise RuntimeError("Unsupported platform")
+
+        # Dynamo's graph break may import more than one graph.
+        graph = self._imported_graphs[-1]
+        graph.compile()
+        # Collect dependency libraries.
+        lib_extension = get_lib_extension()
+        lib_names = ["libmlir_runner_utils", "libmlir_c_runner_utils", "libomp"]
+        path_prefix = os.path.dirname(os.path.abspath(__file__))
+        lib_base_path = os.path.join(path_prefix, "../../../../llvm/build/lib/")
+        lib_base_path = os.path.abspath(lib_base_path)
+        shared_libs = [
+            os.path.join(lib_base_path, lib_name + lib_extension)
+            for lib_name in lib_names
+        ]
+        # Define execution engine.
+        ee = ExecutionEngine(
+            graph._imported_module, opt_level=3, shared_libs=shared_libs
         )
-        if isinstance(op_ret, tuple):
-            for i, operation in enumerate(op_ret):
-                self._symbol_table[(str(node.name), i)] = operation.result
-        elif isinstance(op_ret, ir.OpResult):
-            self._symbol_table[(str(node.name), 0)] = op_ret
-        else:
-            self._symbol_table[(str(node.name), 0)] = op_ret.result
+
+        def cast_c_ptr(outdata_ptr, memref_ptr):
+            """
+            Casts a C pointer (`outdata_ptr`) to the type of another C pointer 
+            (`memref_ptr`).
+
+            Args:
+                outdata_ptr: ctypes.POINTER
+                The C pointer whose type needs to be cast.
+                memref_ptr: ctypes.POINTER
+                The reference C pointer whose type will be used for casting.
+
+            Returns:
+            ctypes.POINTER
+                A new C pointer with the type of `memref_ptr`, representing the 
+                same memory location as `outdata_ptr`.
+
+            Example:
+            outdata = ctypes.pointer(ctypes.c_int())
+            memref = ctypes.pointer(ctypes.c_float())
+            casted_ptr = cast_c_ptr(outdata, memref)
+            # Now `casted_ptr` points to the same memory location as `outdata`, 
+            but with the type of `memref`.
+            """
+            outdata_addr = ctypes.addressof(outdata_ptr.contents)
+            out_ptr = ctypes.cast(outdata_addr, type(memref_ptr))
+            return out_ptr
+
+        def move_c_ptr(outdata_ptr, memref_ptr):
+            """
+            Moves a C pointer (`outdata_ptr`) to the next element in memory, 
+            based on the size of the referenced type in another C pointer 
+            (`memref_ptr`).
+
+            Args:
+                outdata_ptr: ctypes.POINTER
+                The C pointer whose position needs to be moved.
+                memref_ptr: ctypes.POINTER
+                The reference C pointer whose type determines the size of each 
+                element for the move.
+
+            Returns:
+            ctypes.POINTER
+                A new C pointer pointing to the next element in memory, based on
+                the size of the type referenced by `memref_ptr`.
+            """
+            elem_size = ctypes.sizeof(memref_ptr.contents)
+            outdata_addr = ctypes.addressof(outdata_ptr.contents)
+            out_ptr = ctypes.cast(outdata_addr + elem_size, type(memref_ptr))
+            return out_ptr
+
+        def exec_buddy_graph(*args):
+            """
+            Invoke the compiled graph function on the provided input tensors.
+
+            Args:
+                *args: List[torch.Tensor]
+                Input tensors to be passed to the graph's function.
+
+            Returns:
+            List[torch.Tensor]
+                The result of executing the graph, represented as a list of 
+                output tensors.
+            """
+            # A list of ctypes pointers representing memory references for input
+            # tensors.
+            input_memref = [
+                ctypes.pointer(
+                    ctypes.pointer(
+                        rt.get_ranked_memref_descriptor(tensor.numpy())
+                    )
+                )
+                for tensor in args
+            ]
+            # A list of ctypes pointers representing memory references for 
+            # output tensors.
+            output_memref = [
+                ctypes.pointer(ctypes.pointer(graph._output_descriptor()))
+            ]
+            args_memref = output_memref + input_memref
+            # Invoke the graph's function using the provided execution engine 
+            # and memory references
+            ee.invoke(graph._func_name, *args_memref)
+
+            output_tensor = []
+            outdata_ptr = args_memref[0][0]
+            # Iterate through each output memory reference in the graph
+            for output_ptr in graph._output_memref:
+                # Cast the output data pointer to the type of the current output
+                # memory reference
+                data_ptr = cast_c_ptr(outdata_ptr, output_ptr[0])
+                # Convert the C data pointer to a NumPy array and append it to
+                # the output_tensor list
+                output_tensor.append(rt.ranked_memref_to_numpy(data_ptr))
+                # Move to the next element in memory based on the size of the
+                # current output type
+                outdata_ptr = move_c_ptr(outdata_ptr, output_ptr[0])
+            # Convert each NumPy array to a PyTorch tensor and return the list 
+            # of tensors
+            return [torch.from_numpy(tensor) for tensor in output_tensor]
+
+        return exec_buddy_graph
diff --git a/frontend/Python/graph/__init__.py b/frontend/Python/graph/__init__.py
new file mode 100644
index 000000000..bd927a3c0
--- /dev/null
+++ b/frontend/Python/graph/__init__.py
@@ -0,0 +1,23 @@
+# ===- __init__.py -------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# Init the packages in graph directory.
+#
+# ===---------------------------------------------------------------------------
+
+from .graph import Graph
+from .operation import *
+from .type import TensorDType, TensorMeta
diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py
new file mode 100644
index 000000000..be2ce438c
--- /dev/null
+++ b/frontend/Python/graph/graph.py
@@ -0,0 +1,487 @@
+# ===- graph.py ----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the graph level of the Buddy Compiler frontend.
+#
+# ===---------------------------------------------------------------------------
+
+from typing import Any, List, Optional
+from types import FunctionType
+import ctypes
+import functools
+
+import numpy as np
+import mlir.ir as ir
+import mlir.dialects.func as func
+from mlir.passmanager import *
+from mlir.execution_engine import *
+from mlir import runtime as rt
+
+from .operation import *
+from .type import *
+
+
+def make_output_memref_descriptor(ranks, dtypes):
+    """
+    Make an output memref descriptor for the given memref ranks and dtypes.
+
+    Parameters:
+    - ranks: List[int]
+        A list of integers representing the ranks of each memref.
+    - dtypes: List[type]
+        A list of ctypes element types, one for each memref.
+
+    Returns:
+    Type[ctypes.Structure]
+        A struct class with one memref descriptor field per output.
+
+    Example:
+    ranks = [2, 3, 1]
+    dtypes = [ctypes.c_float, ctypes.c_int64, ctypes.c_bool]
+    descriptor = make_output_memref_descriptor(ranks, dtypes)
+    # Use the descriptor in your code
+    """
+    memref_descriptor = []
+    for i, rank, dtype in zip(range(len(ranks)), ranks, dtypes):
+        memref_descriptor.append(
+            (str(i), rt.make_nd_memref_descriptor(rank, dtype))
+        )
+
+    class OutputDescriptor(ctypes.Structure):
+        """Builds an output struct descriptor for the multi memref."""
+
+        _fields_ = memref_descriptor
+
+    return OutputDescriptor
+
+
+class Graph:
+    """
+    Graph is a graph-level expression for the Buddy Compiler frontends.
+    It acts as a model compute graph, which converts a Graph into an equivalent 
+    MLIR module.
+
+    Attributes:
+    - _body: List[Op]
+        The sequence of operation nodes in the graph.
+    - _inputs: List[TensorMeta]
+        The model inputs represented as TensorMeta objects.
+    - _fake_params: List[TensorMeta]
+        The fake parameters represented as TensorMeta objects.
+    - device: str
+        The hardware for graph runtime.
+    - _imported_module: Union[None, ImportedModuleType]
+        The imported MLIR module after compilation, if set.
+    - _ops_registry: dict
+        The ops lower strategy for the graph.
+    - _func_name: str
+        The function name for the MLIR module.
+    - _ctx: ir.Context
+        The context of the MLIR module.
+    - _output_memref: Union[None, List[ctypes.POINTER]]
+        The memref pointers, one per MLIR function output, if set.
+    - _output_descriptor: Union[None, OutputDescriptorType]
+        The output descriptor struct class for the MLIR function, if set.
+    - execution_engine: Union[None, ExecutionEngineType]
+        The execution engine for the graph, if set.
+    """
+
+    def __init__(
+        self,
+        inputs: List[TensorMeta],
+        fake_params: List[TensorMeta],
+        ops_registry: dict,
+        func_name: str,
+    ) -> None:
+        """
+        Initializes the Graph.
+
+        Args:
+            inputs: List[TensorMeta]
+                The model inputs represented as TensorMeta objects.
+            fake_params: List[TensorMeta]
+                The fake parameters represented as TensorMeta objects.
+            ops_registry: dict
+                The ops lower strategy for the graph.
+            func_name: str
+                The function name for the MLIR module.
+        """
+        self._body = []
+        self._inputs = inputs
+        self.node_table: Dict[str, Op] = {}
+        self._fake_params = fake_params
+        self.device = "cpu"
+        self._imported_module = None
+        self._ops_registry = ops_registry
+        self._func_name = func_name
+        self._ctx = ir.Context()
+        self._output_memref = None
+        self._output_descriptor = None
+        self.execution_engine = None
+
+    def add_node(self, node: Op):
+        """
+        Adds an operation node to the graph's body.
+
+        Parameters:
+        - node: Op
+            The operation node to be added to the graph.
+
+        Returns:
+        None
+
+        Example:
+        graph_instance = Graph(inputs, fake_params, ops_registry, func_name)
+        op_node = Op()
+        graph_instance.add_node(op_node)
+        # The op_node is now part of the graph's body
+        """
+        self._body.append(node)
+        self.node_table[node.name] = node
+
+    def perform(self, func_list: List[FunctionType]):
+        for transform_func in func_list:
+            transform_func(self)
+
+    def lower_to_top_level_ir(self, do_params_pack=False):
+        """
+        Lowers the graph to top-level MLIR dialects.
+
+        Parameters:
+        - do_params_pack: bool, optional (default=False)
+            Flag indicating whether to perform parameters packing to one memref.
+
+        Returns:
+        None
+
+        Example:
+        graph_instance = Graph(inputs, fake_params, ops_registry, func_name)
+        graph_instance.lower_to_top_level_ir(do_params_pack=True)
+        # The graph is now lowered to top-level MLIR dialects
+        """
+        with ir.Location.unknown(self._ctx):
+            fx_importer = GraphImporter(
+                self._body,
+                self._fake_params,
+                self._inputs,
+                do_params_pack,
+                self._func_name,
+                self._ops_registry,
+            )
+            self._imported_module = fx_importer.import_graph()
+            outputs = fx_importer.get_output_nodes()
+        self._output_memref = []
+        output_ranks = []
+        output_dtypes = []
+        for out_node in outputs:
+            out_type = ir.RankedTensorType(out_node.type)
+            shape = list(out_type.shape)
+            dtype = out_type.element_type
+            match str(dtype):
+                case "i1":
+                    np_type = np.dtype(np.bool_)
+                case "i32":
+                    np_type = np.dtype(np.int32)
+                case "i64":
+                    np_type = np.dtype(np.int64)
+                case "f32":
+                    np_type = np.dtype(np.float32)
+                case _:
+                    raise NotImplementedError(f"Unsupported dtype {dtype}")
+            self._output_memref.append(
+                ctypes.pointer(
+                    ctypes.pointer(
+                        rt.make_nd_memref_descriptor(
+                            len(shape), rt.as_ctype(np_type)
+                        )()
+                    )
+                )
+            )
+            output_ranks.append(len(shape))
+            output_dtypes.append(rt.as_ctype(np_type))
+        self._output_descriptor = make_output_memref_descriptor(
+            output_ranks, output_dtypes
+        )
+
+    def lower_to_llvm_ir(self):
+        """
+        Lower graph to llvm ir.
+        """
+        if self._imported_module is None:
+            self.lower_to_top_level_ir()
+
+        with ir.Location.unknown(self._ctx):
+            pm = PassManager("builtin.module")
+            pm.add("func.func(tosa-to-linalg-named)")
+            pm.add("func.func(tosa-to-linalg)")
+            pm.add("func.func(tosa-to-tensor)")
+            pm.add("func.func(tosa-to-arith)")
+            pm.run(self._imported_module.operation)
+            pm.add("arith-expand")
+            pm.add("eliminate-empty-tensors")
+            pm.add("empty-tensor-to-alloc-tensor")
+            pm.add("convert-elementwise-to-linalg")
+            pm.add('one-shot-bufferize')
+            pm.add("func.func(convert-linalg-to-affine-loops)")
+            pm.add("affine-loop-fusion")
+            pm.add("func.func(affine-parallelize)")
+            pm.add("lower-affine")
+            pm.add("convert-scf-to-openmp")
+            pm.add("func-bufferize")
+            pm.add("arith-bufferize")
+            pm.add("func.func(tensor-bufferize)")
+            pm.add("func.func(buffer-deallocation)")
+            pm.add("func.func(finalizing-bufferize)")
+            pm.add("expand-strided-metadata")
+            pm.add("convert-vector-to-llvm")
+            pm.add("memref-expand")
+            pm.add("arith-expand")
+            pm.add("convert-arith-to-llvm")
+            pm.add("finalize-memref-to-llvm")
+            pm.add("convert-scf-to-cf")
+            pm.add("func.func(llvm-request-c-wrappers)")
+            pm.add("convert-openmp-to-llvm")
+            pm.add("convert-math-to-llvm")
+            pm.add("convert-math-to-libm")
+            pm.add("convert-func-to-llvm")
+            pm.add("reconcile-unrealized-casts")
+            pm.run(self._imported_module.operation)
+
+    def compile(self):
+        """
+        Compile graph from Buddy Graph to LLVM IR.
+        """
+        self.lower_to_top_level_ir()
+        self.lower_to_llvm_ir()
+
+
+class GraphImporter:
+    """
+    Imports a buddy graph and generates an MLIR module in high-level dialects.
+
+    Attributes:
+        _symbol_table (dict): A dictionary to keep track of the symbols.
+        _body (List[Op]): The buddy graph operation nodes to be imported.
+        _func_name (str): Name of the generated MLIR function.
+        _inputs (List[TensorMeta]): Input tensor(s) of the buddy graph.
+        _num_input_visited (int): Number of input nodes that have been visited.
+        _module (mlir.ir.Module): The generated MLIR module.
+        _ops_registry (dict): Registry for the candidate operations.
+    """
+
+    def __init__(
+        self,
+        body: List[Op],
+        params: List[TensorMeta],
+        inputs: List[TensorMeta],
+        do_param_pack: bool,
+        func_name: str,
+        ops_registry: dict,
+    ):
+        """
+        Initializes the buddy Graph importer.
+
+        Args:
+            body (List[Op]): The buddy graph nodes that will be imported.
+            inputs (List[TensorMeta]): Input tensor(s) of the buddy graph.
+            func_name (str): Name of the generated MLIR function.
+            ops_registry (dict): Registry for the candidate operations.
+        """
+        if ops_registry is None:
+            ops_registry = {}
+        self._symbol_table = {}
+        self._body = body
+        self._func_name = func_name
+        self._params = params
+        self._inputs = inputs
+        self._do_param_pack = do_param_pack
+        self._param_packs = []
+        self._num_input_visited = 0
+        self._module = ir.Module.create()
+        self._ops_registry = ops_registry
+        self._current_param_pack_offset = None
+
+    def _str_to_mlir_dtype(self, dtype: str) -> ir.Type:
+        """
+        Converts a str to the corresponding MLIR dtype.
+
+        Args:
+            dtype (str): The tensor type.
+
+        Returns:
+            mlir.ir.Type: The corresponding MLIR data type.
+
+        Raises:
+            NotImplementedError: If the given dtype is not supported.
+        """
+        match dtype:
+            case TensorDType.Int32:
+                return ir.IntegerType.get_signless(32)
+            case TensorDType.Int64:
+                return ir.IntegerType.get_signless(64)
+            case TensorDType.Float32:
+                return ir.F32Type.get()
+            case TensorDType.Bool:
+                return ir.IntegerType.get_signless(1)
+            case _:
+                raise NotImplementedError(f"Unsupported dtype {dtype}")
+
+    def _pack_params(self) -> None:
+        """
+        Builds one flat 1-D tensor type per distinct parameter dtype.
+
+        Returns:
+        None
+
+        Example:
+        importer = GraphImporter(body, params, inputs, True, name, registry)
+        importer._pack_params()
+        # importer._param_packs now holds one tensor type per parameter dtype.
+        """
+        dtypes = list(set([param.dtype for param in self._params]))
+        dtypes.sort(key=str)
+        self._current_param_pack_offset = {dtype: 0 for dtype in dtypes}
+        for dtype in dtypes:
+            params_of_dtype = [
+                param for param in self._params if param.dtype == dtype
+            ]
+            param_total_size = 0
+            for param in params_of_dtype:
+                param_total_size += functools.reduce(
+                    lambda x, y: x * y, list(param.shape), 1
+                )
+            mlir_dtype = self._str_to_mlir_dtype(dtype)
+            self._param_packs.append(
+                ir.RankedTensorType.get([param_total_size], mlir_dtype)
+            )
+
+    def import_graph(self) -> ir.Module:
+        """
+        Imports buddy graph and generates an MLIR module in high-level dialects.
+
+        Returns:
+            mlir.ir.Module: An MLIR module in high-level dialects.
+        """
+        with ir.InsertionPoint(self._module.body):
+            arguments = []
+            if self._do_param_pack:
+                self._pack_params()
+                arguments.extend(self._param_packs)
+                inputs = self._inputs
+            else:
+                inputs = self._params + self._inputs
+            for arg in inputs:
+                shape_list = list(arg.shape)
+                dtype = arg.dtype
+                mlir_dtype = self._str_to_mlir_dtype(dtype)
+                tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype)
+                arguments.append(tensor_arg)
+
+            @func.FuncOp.from_py_func(*arguments, name=self._func_name)
+            def generated_func(*args):
+                args_list = list(args)
+                for node in self._body:
+                    if isinstance(node, OutputOp):
+                        output_node_args = node.args
+                        returns = [
+                            self._symbol_table.get((str(output_arg), 0))
+                            for output_arg in output_node_args
+                        ]
+                        self._symbol_table[("output", 0)] = returns
+                    elif isinstance(node, PlaceholderOp):
+                        self._import_placeholder(node, args_list)
+                    elif isinstance(node, GetItemOp):
+                        self._symbol_table[
+                            (str(node.name), 0)
+                        ] = self._symbol_table[
+                            (str(node.args[0]), node.args[1])
+                        ]
+                    else:
+                        self._import_op(node)
+
+                return self._symbol_table.get(("output", 0))
+
+        return self._module
+
+    def _import_placeholder(
+        self, node: PlaceholderOp, args_list: List[ir.BlockArgument]
+    ):
+        """
+        Imports a placeholder node from the Buddy graph.
+
+        Parameters:
+        - node (PlaceholderOp): The PlaceholderOp node representing the 
+        placeholder.
+        - args_list (List[mlir.ir.BlockArgument]): The function's tensor args.
+
+        Returns:
+        None
+        """
+        if self._num_input_visited < len(self._params) and self._do_param_pack:
+            dtype = node.tensor_meta["dtype"]
+            pack_of_dtype = None
+            for pack in args_list:
+                if ir.RankedTensorType(
+                    pack.type
+                ).element_type == self._str_to_mlir_dtype(dtype):
+                    pack_of_dtype = pack
+                    break
+            placeholder_name = self._ops_registry["param.extract"](
+                node, self._current_param_pack_offset[dtype], pack_of_dtype
+            ).result
+            self._current_param_pack_offset[dtype] += functools.reduce(
+                lambda x, y: x * y, list(node.tensor_meta["shape"]), 1
+            )
+        elif self._do_param_pack:
+            if len(self._params) > 0:
+                placeholder_name = args_list[
+                    self._num_input_visited
+                    - len(self._params)
+                    + len(self._param_packs)
+                ]
+            else:
+                placeholder_name = args_list[self._num_input_visited]
+        else:
+            placeholder_name = args_list[self._num_input_visited]
+
+        self._symbol_table[(str(node.name), 0)] = placeholder_name
+        self._num_input_visited += 1
+
+    def _import_op(self, node: Op):
+        """
+        Imports an operation node from the buddy graph.
+
+        Args:
+            node (Op): The buddy node representing the operation.
+
+        """
+        op_name = node.__class__.__name__
+        op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = (
+            self._ops_registry[op_name](node, self._symbol_table)
+        )
+        if isinstance(op_ret, tuple):
+            for i, operation in enumerate(op_ret):
+                self._symbol_table[(str(node.name), i)] = operation.result
+        elif isinstance(op_ret, ir.OpResult):
+            self._symbol_table[(str(node.name), 0)] = op_ret
+        else:
+            self._symbol_table[(str(node.name), 0)] = op_ret.result
+
+    def get_output_nodes(self):
+        """
+        Get output nodes from the lowered mlir func.
+        """
+        return self._symbol_table.get(("output", 0))
diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py
new file mode 100644
index 000000000..550f3f321
--- /dev/null
+++ b/frontend/Python/graph/operation.py
@@ -0,0 +1,456 @@
+# ===- operation.py ------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the operation structure of Buddy Compiler graph representation.
+#
+# ===---------------------------------------------------------------------------
+
+from enum import Enum
+from typing import Dict, Optional, List, Tuple
+
+from .type import TensorDType, TensorMeta
+
+
+class OpType(Enum):
+    """
+    Enum class for declaring operation types.
+
+    Members:
+    - BroadcastType: int
+        Represents a broadcast operation.
+    - ElementwiseType: int
+        Represents an elementwise operation.
+    - ReshapeType: int
+        Represents a reshape operation.
+    - ReduceType: int
+        Represents a reduction operation.
+    - ConcatType: int
+        Represents a concatenation operation.
+    - PlaceholderType: int
+        Represents a placeholder operation.
+    - GetItemType: int
+        Represents an operation to retrieve an item.
+
+    Note: The underlying values are integers for these operation types.
+    """
+
+    BroadcastType = 0
+    ElementwiseType = 1
+    ReshapeType = 2
+    ReduceType = 3
+    ConcatType = 4
+    PlaceholderType = 5
+    GetItemType = 6
+
+
+class Op:
+    """
+    Base class for all operations in a computational graph.
+
+    Attributes:
+    - _name: str
+        The unique name of the operation node.
+    - _arguments: list
+        The input arguments of the operation node.
+    - _keyword_arguments: dict
+        The keyword arguments of the operation node.
+    - _tensor_meta: dict
+        The metadata of the output tensor, including shape and data type.
+    - _op_type: OpType
+        The type of the operation node, as defined in the OpType enum.
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize a new instance of the Op class.
+        """
+        self._name = None
+        self._arguments = []
+        self._keyword_arguments = {}
+        self._tensor_meta: List[TensorMeta] = {}
+        self._op_type: OpType = None
+        self._children: List[str] = []
+        self._parents: List[str] = []
+
+    def add_argument(self, arg):
+        """
+        Add an input argument to the operation node.
+
+        Parameters:
+        - arg: Any
+            The input argument to be added.
+        """
+        self._arguments.append(arg)
+
+    def add_parent(self, parent: str):
+        """
+        Add a parent node's name to the operation node.
+
+        Parameters:
+        - parent: str
+            The parent node's name to be added.
+        """
+        self._parents.append(parent)
+
+    def add_children(self, child):
+        """
+        Add a child (user) node's name to the operation node.
+
+        Parameters:
+        - child: str
+            The child (user) node's name to be added.
+        """
+        self._children.append(child)
+
+    @property
+    def args(self):
+        return self._arguments
+
+    @property
+    def kwargs(self):
+        return self._keyword_arguments
+
+    @property
+    def name(self):
+        return self._name
+
+    @name.setter
+    def name(self, new_name):
+        self._name = new_name
+
+    @property
+    def tensor_meta(self):
+        return self._tensor_meta
+
+
+class PlaceholderOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+
+class MatmulOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class GetItemOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.GetItemType
+
+
+class OutputOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.GetItemType
+
+
+class ArangeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+
+class UnsqueezeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class ViewOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class EmbeddingOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class OnesOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+
+class FullOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+
+class LessThanOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class MaskedFillOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class SliceOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class ToCopyOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class RsubOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class PowOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class MeanOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class RsqrtOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class MulOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class TransposeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class IndexOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class NegOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class CatOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ConcatType
+
+
+class SqueezeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class BatchMatmulOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class DivOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class SoftmaxOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class CloneOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class SiluOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class AddOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class AddMMOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class AmaxOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class SubOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.BroadcastType
+
+
+class ConvertElementTypeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class ExpOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class ExpandOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class PermuteOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class ReshapeOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class SelectOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class SumDimOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class TanhOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class VarMeanOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+
+
+class TOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReshapeType
+
+
+class ErfOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+class Conv2dOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+        self._layout = "NCHW_FCHW"
+
+class ReluOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+class SigmoidOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+class IotaOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+class ScalarTensorOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.PlaceholderType
+
+class WhereOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+class MaxPool2dWithIndicesOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+        self._layout = "NCHW"
+
+
+class MaxPool2dOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ReduceType
+        self._layout = "NCHW"
+
+
+class ReciprocalOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
+
+
+class SqrtOp(Op):
+    def __init__(self) -> None:
+        super().__init__()
+        self._op_type = OpType.ElementwiseType
diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py
new file mode 100644
index 000000000..c4b7ac3d1
--- /dev/null
+++ b/frontend/Python/graph/transform/__init__.py
@@ -0,0 +1 @@
+from .useless_op_eliminate import maxpool2d_simplify
\ No newline at end of file
diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py
new file mode 100644
index 000000000..1b3f59296
--- /dev/null
+++ b/frontend/Python/graph/transform/useless_op_eliminate.py
@@ -0,0 +1,66 @@
+# ===- useless_op_eliminate.py -------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# simplify the maxpool2d with getitem.
+#
+# ===---------------------------------------------------------------------------
+
+from .. import Graph
+from ..operation import *
+
+
+def maxpool2d_simplify(graph: Graph):
+    """
+    Fuse the maxpool op and its only getitem user to simplify the graph.
+
+    Args:
+        graph (Graph): The Buddy Graph to be simplified (modified in place).
+    """
+    for i, node in enumerate(graph._body):
+        if isinstance(node, MaxPool2dWithIndicesOp):
+            getitem_num = 0
+            for user in node._children:
+                if isinstance(graph.node_table[user], GetItemOp):
+                    getitem_num += 1
+                    getitem_node = graph.node_table[user]
+            if (
+                getitem_num == 1
+                and len(node._children) == 1
+                and getitem_node.args[1] == 0
+            ):
+                new_node = MaxPool2dOp()
+                new_node.name = getitem_node.name
+                for arg in node.args:
+                    new_node.add_argument(arg)
+                for parent in node._parents:
+                    new_node.add_parent(parent)
+                for child in getitem_node._children:
+                    new_node.add_children(child)
+                new_node.tensor_meta["shape"] = getitem_node.tensor_meta[
+                    "shape"
+                ]
+                new_node.tensor_meta["dtype"] = getitem_node.tensor_meta[
+                    "dtype"
+                ]
+                new_node._layout = node._layout
+                del graph.node_table[node.name]
+                del graph.node_table[getitem_node.name]
+                graph.node_table[new_node.name] = new_node
+                del graph._body[i]
+                for j, op in enumerate(graph._body):
+                    if op == getitem_node:
+                        graph._body[j] = new_node
+                        break
diff --git a/frontend/Python/graph/type.py b/frontend/Python/graph/type.py
new file mode 100644
index 000000000..5e1db3ed8
--- /dev/null
+++ b/frontend/Python/graph/type.py
@@ -0,0 +1,79 @@
+# ===- type.py -----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the tensor type of the Buddy Compiler frontend.
+#
+# ===---------------------------------------------------------------------------
+
+from enum import Enum
+
+
+class TensorDType(Enum):
+    """
+    Enum class for declaring tensor data types.
+
+    Members (each value is the dtype's string name):
+    - Int32: str
+        Represents the 32-bit integer data type.
+    - Int64: str
+        Represents the 64-bit integer data type.
+    - Float16, Float32, Float64: str
+        Represent the 16-, 32- and 64-bit floating-point data types.
+    - Bool: str
+        Represents the boolean data type.
+    """
+
+    Int32 = "int32"
+    Int64 = "int64"
+    Float16 = "float16"
+    Float32 = "float32"
+    Float64 = "float64"
+    Bool = "bool"
+
+
+class TensorMeta:
+    """
+    Store tensor metadata, including shape and data type, while overlooking raw 
+    data.
+
+    Attributes:
+    - shape: tuple
+        Represents the shape of the tensor.
+    - dtype: str
+        Represents the data type of the tensor.
+
+    Methods:
+    - __init__(shape: tuple, dtype: str) -> None:
+        Initializes a new instance of the TensorMeta class with the specified 
+        shape and data type.
+
+    Example:
+    meta = TensorMeta(shape=(3, 4), dtype='float32')
+    # Access metadata attributes: meta.shape, meta.dtype
+    """
+
+    def __init__(self, shape, dtype) -> None:
+        """
+        Initialize a new instance of the TensorMeta class.
+
+        Parameters:
+        - shape: tuple
+            Represents the shape of the tensor.
+        - dtype: str
+            Represents the data type of the tensor.
+        """
+        self.shape = shape
+        self.dtype = dtype
diff --git a/frontend/Python/ops/linalg.py b/frontend/Python/ops/linalg.py
index 6a6e161c9..0a22478e1 100644
--- a/frontend/Python/ops/linalg.py
+++ b/frontend/Python/ops/linalg.py
@@ -14,29 +14,70 @@
 #
 # ===---------------------------------------------------------------------------
 #
-# The registry of mappings from Torch node to MLIR linalg dialect operations.
+# The registry of mappings from Buddy Graph to MLIR linalg dialect operations.
 #
 # ===---------------------------------------------------------------------------
 
 from typing import Dict, Tuple, List
 
-import torch
-
 import mlir.ir as ir
 from mlir.dialects import tosa, linalg, arith, tensor, math
 import copy
 import numpy
 import functools
 
+from ..graph import *
+from ..graph.graph import TensorDType
+from .utils import *
+
+
+def add_op(node: AddOp, symbol_table: Dict[Tuple[str, int], ir.Operation]):
+    """
+    Import tensor add operation.
+    From buddy AddOp to MLIR TOSA `add` operation.
+
+    Note: a non-string second argument is materialized as a splat constant.
+
+    Args:
+        node: Containing information from the input graph node.
+        symbol_table: A dictionary mapping symbols to their corresponding
+        operations.
+
+    Returns:
+        op: The operation representing the result tensor of two input nodes' add
+        result.
+    """
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    shape = list(node.tensor_meta["shape"])
+    if isinstance(node.args[1], str):
+        input2 = symbol_table.get((str(node.args[1]), 0))
+    else:
+        data = [node.args[1]]
+        input2_shape = numpy.array(data).shape
+        tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype)
+        element = mlir_element_attr_get(dtype, node.args[1])
+        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+        input2 = arith.ConstantOp(tensor_type, attr).result
+    if input1 is None or input2 is None:
+        return
+    add_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype)
+    op = tosa.AddOp(
+        add_result_tensor_type,
+        input1,
+        input2,
+    )
+    return op.result
+
 
 def arange_op(
-    node: torch.fx.Node,
+    node: ArangeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import tensor arange operation.
-    From PyTorch `aten.arange.default` and `aten.arange.start` operator to MLIR
-    arith `constant` operation.
+    From buddy ArangeOp to MLIR arith `constant` operation.
 
     Note: this function init an output tensor according input range.
 
@@ -49,51 +90,34 @@ def arange_op(
         op: The operation representing the result tensor of ranging the start
         and end from input node.
     """
-    if node.target.__name__ == "arange.start":
+    if len(node.args) == 2:
         start = int(node.args[0])
         end = int(node.args[1])
-        stride = int(node.meta["tensor_meta"].stride[0])
-        dtype = str(node.meta["tensor_meta"].dtype)
-        shape = list(node.meta["tensor_meta"].shape)
-        dtype = ir.IntegerType.get_signless(64)
-        tensor_type = ir.RankedTensorType.get(shape, dtype)
-        attr = ir.DenseElementsAttr.get(
-            numpy.array([i for i in range(start, end, stride)]),
-            signless=True,
-            type=tensor_type,
-        )
-        op = arith.ConstantOp(tensor_type, attr)
-
-    elif node.target.__name__ == "arange.default":
+    else:
         start = 0
         end = int(node.args[0])
-        stride = int(node.meta["tensor_meta"].stride[0])
-        dtype = str(node.meta["tensor_meta"].dtype)
-        shape = list(node.meta["tensor_meta"].shape)
-        dtype = ir.IntegerType.get_signless(64)
-        tensor_type = ir.RankedTensorType.get(shape, dtype)
-        attr = ir.DenseElementsAttr.get(
-            numpy.array([i for i in range(start, end, stride)]),
-            signless=True,
-            type=tensor_type,
-        )
-        op = arith.ConstantOp(tensor_type, attr)
+    stride = 1
+    dtype = node.tensor_meta["dtype"]
+    shape = list(node.tensor_meta["shape"])
+    dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(shape, dtype)
+    attr = ir.DenseElementsAttr.get(
+        numpy.array([i for i in range(start, end, stride)]),
+        signless=True,
+        type=tensor_type,
+    )
+    op = arith.ConstantOp(tensor_type, attr)
 
     return op
 
 
 def unsqueeze_op(
-    node: torch.fx.Node,
+    node: UnsqueezeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the unsqueeze operation.
-    From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape`
-    operation.
-
-    Note: "unsqueeze" means inserting a new dimension of size 1 at the specified
-          position. For more information, please refer to
-          https://pytorch.org/docs/stable/generated/torch.unsqueeze.html
+    From buddy UnsqueezeOp to MLIR TOSA `reshape` operation.
 
     Args:
         node: Containing information from the input graph node.
@@ -118,12 +142,12 @@ def unsqueeze_op(
 
 
 def view_op(
-    node: torch.fx.Node,
+    node: ViewOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor view operation.
-    From PyTorch `aten.view.default` operator to MLIR TOSA `reshape` operation.
+    From buddy ViewOp to MLIR TOSA `reshape` operation.
 
     Note: If the new shape contains one and only one `-1`, the size of the new
     shape will be inferred automatically.
@@ -160,13 +184,12 @@ def view_op(
 
 
 def embedding_op(
-    node: torch.fx.Node,
+    node: EmbeddingOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the embedding operation.
-    From PyTorch `aten.embedding.default` operator to MLIR linalg `generic`
-    operation.
+    From buddy EmbeddingOp to MLIR linalg `generic` operation.
 
     Note: In this op, input node1's value is as index to get input node2's row
     slice.
@@ -180,52 +203,51 @@ def embedding_op(
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     input2 = symbol_table.get((str(node.args[1]), 0))
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation([0, 1, 2])
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input2],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(generic_map.get_submap([0, 1])),
-                    ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")] * 3
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, dtype)
+    output = tensor.EmptyOp(output_shape, dtype)
+    generic_map = ir.AffineMap.get_permutation([0, 1, 2])
+    op = linalg.GenericOp(
+        [tensor_type],
+        [input2],
+        [output],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(input2.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0])
-        index2 = linalg.IndexOp(ir._i64Attr(2, None))
-        value = tensor.ExtractOp(input1, [index1.result, index2.result])
-        block.append(index1)
-        block.append(index2)
-        block.append(value)
-        block.append(linalg.YieldOp([value.result]))
+                ir.AffineMapAttr.get(generic_map.get_submap([0, 1])),
+                ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")] * 3
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            ir.RankedTensorType(input2.type).element_type,
+            ir.RankedTensorType(output.result.type).element_type,
+        ],
+    )
+    index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0])
+    index2 = linalg.IndexOp(ir._i64Attr(2, None))
+    value = tensor.ExtractOp(input1, [index1.result, index2.result])
+    block.append(index1)
+    block.append(index2)
+    block.append(value)
+    block.append(linalg.YieldOp([value.result]))
 
     return op
 
 
 def ones_op(
-    node: torch.fx.Node,
+    node: OnesOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor ones operation.
-    From PyTorch `aten.ones.default` operator to MLIR arith `constant`
-    operation.
+    From buddy OnesOp to MLIR arith `constant` operation.
 
     Note: This op, input node1's value is as index to get input node2's row
     slice.
@@ -238,30 +260,21 @@ def ones_op(
         op: The operation return the arith.constant op.
     """
     output_shape = list(node.args[0])
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.bool":
-        element = ir.BoolAttr.get(1)
-        tensor_type = ir.RankedTensorType.get(output_shape, element.type)
-        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
-    elif dtype == "torch.int64":
-        dtype = ir.IntegerType.get_signless(64)
-        tensor_type = ir.RankedTensorType.get(output_shape, dtype)
-        attr = ir.DenseElementsAttr.get(
-            numpy.ones(output_shape), signless=True, type=tensor_type
-        )
+    dtype = node.tensor_meta["dtype"]
+    element = mlir_element_attr_get(dtype, 1)
+    tensor_type = ir.RankedTensorType.get(output_shape, element.type)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
     op = arith.ConstantOp(tensor_type, attr)
 
     return op
 
-
 def full_op(
-    node: torch.fx.Node,
+    node: FullOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor full operation.
-    From PyTorch `aten.full.default` operator to MLIR arith `constant`
-    operation.
+    From buddy FullOp to MLIR arith `constant` operation.
 
     Note: This op, input node1's value is the shape of output tensor, input
     node2's value is the value of all elements in output tensor.
@@ -275,39 +288,22 @@ def full_op(
     """
     output_shape = list(node.args[0])
     value = node.args[1]
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.bool":
-        element = ir.BoolAttr.get(bool(value))
-        tensor_type = ir.RankedTensorType.get(output_shape, element.type)
-        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
-    elif dtype == "torch.int64":
-        dtype = ir.IntegerType.get_signless(64)
-        tensor_type = ir.RankedTensorType.get(output_shape, dtype)
-        attr = ir.DenseElementsAttr.get(
-            numpy.full(output_shape, value, dtype=numpy.int64),
-            signless=True,
-            type=tensor_type,
-        )
-    elif dtype == "torch.float32":
-        dtype = ir.F32Type.get()
-        tensor_type = ir.RankedTensorType.get(output_shape, dtype)
-        attr = ir.DenseElementsAttr.get(
-            numpy.full(output_shape, value, dtype=numpy.float32),
-            signless=True,
-            type=tensor_type,
-        )
+    dtype = node.tensor_meta["dtype"]
+    element = mlir_element_attr_get(dtype, value)
+    tensor_type = ir.RankedTensorType.get(output_shape, element.type)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
     op = arith.ConstantOp(tensor_type, attr)
 
     return op
 
 
 def lt_op(
-    node: torch.fx.Node,
+    node: LessThanOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor less than operation.
-    From PyTorch `aten.lt.Tensor` operator to MLIR arith `constant` operation.
+    From buddy LessThanOp to MLIR arith `constant` operation.
 
     Note: This op, campare two input nodes, and output bool tensor to represent
     compare result.
@@ -321,93 +317,86 @@ def lt_op(
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     input2 = symbol_table.get((str(node.args[1]), 0))
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
     value = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), 2)
     shp1 = list(ir.RankedTensorType(ir.Value(input1).type).shape)
     shp2 = list(ir.RankedTensorType(ir.Value(input2).type).shape)
-    if dtype == "torch.bool":
-        tensor_type = ir.RankedTensorType.get(
-            output_shape, ir.IntegerType.get_signless(1)
-        )
-        output = tensor.EmptyOp(output_shape, ir.IntegerType.get_signless(1))
-        if len(shp1) < len(shp2):
-            if int(shp1[-1]) > 1 and shp2[-1] == 1:
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(shp2) + 1)]
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1, input2],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [
-                                        i
-                                        for i in range(
-                                            len(shp2) - len(shp1), len(shp2)
-                                        )
-                                    ]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(0, len(shp2) - 1)]
-                                    + [len(shp2)]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(0, len(shp2))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(shp2)
-                        + [
-                            ir.Attribute.parse(
-                                "#linalg.iterator_type<reduction>"
+    dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, dtype)
+    output = tensor.EmptyOp(output_shape, dtype)
+    if len(shp1) < len(shp2):
+        if int(shp1[-1]) > 1 and shp2[-1] == 1:
+            generic_map = ir.AffineMap.get_permutation(
+                [i for i in range(len(shp2) + 1)]
+            )
+            op = linalg.GenericOp(
+                [tensor_type],
+                [input1, input2],
+                [output],
+                ir.ArrayAttr.get(
+                    [
+                        ir.AffineMapAttr.get(
+                            generic_map.get_submap(
+                                [
+                                    i
+                                    for i in range(
+                                        len(shp2) - len(shp1), len(shp2)
+                                    )
+                                ]
                             )
-                        ]
-                    ),
+                        ),
+                        ir.AffineMapAttr.get(
+                            generic_map.get_submap(
+                                [i for i in range(0, len(shp2) - 1)]
+                                + [len(shp2)]
+                            )
+                        ),
+                        ir.AffineMapAttr.get(
+                            generic_map.get_submap(
+                                [i for i in range(0, len(shp2))]
+                            )
+                        ),
+                    ]
+                ),
+                ir.ArrayAttr.get(
+                    [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+                    * len(shp2)
+                    + [ir.Attribute.parse("#linalg.iterator_type<reduction>")]
+                ),
+            )
+            block = ir.Block.create_at_start(
+                op.region,
+                [
+                    ir.RankedTensorType(input2.type).element_type,
+                    ir.RankedTensorType(input2.type).element_type,
+                    dtype,
+                ],
+            )
+            if (
+                str(ir.RankedTensorType(input2.type).element_type).find("i")
+                != -1
+            ):
+                cmpop = arith.CmpIOp(
+                    value, block.arguments[0], block.arguments[1]
                 )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input2.type).element_type,
-                        ir.RankedTensorType(input2.type).element_type,
-                        ir.IntegerType.get_signless(1),
-                    ],
+            else:
+                cmpop = arith.CmpFOp(
+                    value, block.arguments[0], block.arguments[1]
                 )
-                if (
-                    str(ir.RankedTensorType(input2.type).element_type).find("i")
-                    != -1
-                ):
-                    cmpop = arith.CmpIOp(
-                        value, block.arguments[0], block.arguments[1]
-                    )
-                else:
-                    cmpop = arith.CmpFOp(
-                        value, block.arguments[0], block.arguments[1]
-                    )
-                block.append(cmpop)
-                block.append(linalg.YieldOp([cmpop.result]))
+            block.append(cmpop)
+            block.append(linalg.YieldOp([cmpop.result]))
 
     return op
 
 
 def masked_fill_op(
-    node: torch.fx.Node,
+    node: MaskedFillOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor masked fill operation.
-    From PyTorch `aten.masked_fill.Scalar` operator to MLIR linalg `generic`
-    operation.
+    From buddy MaskedFillOp to MLIR linalg `generic` operation.
 
     Note: This op, input node2 is a bool tensor. Select input node1's value or
     input node3's value by true or false in input node2's value.
@@ -423,71 +412,67 @@ def masked_fill_op(
     input2 = symbol_table.get((str(node.args[1]), 0))
     if input1 is None or input2 is None:
         return
-    if str(node.args[0].meta["tensor_meta"].dtype) == "torch.float32":
-        value = float(node.args[2])
-        attr = ir.FloatAttr.get(ir.F32Type.get(), value)
-        value = arith.ConstantOp(ir.F32Type.get(), attr)
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation(
-            [i for i in range(len(output_shape))]
-        )
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input1, input2],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
+    dtype = node.tensor_meta["dtype"]
+    value = node.args[2]
+    attr = mlir_element_attr_get(dtype, value)
+    dtype = mlir_element_type_get(dtype)
+    value = arith.ConstantOp(dtype, attr)
+    output_shape = list(node.tensor_meta["shape"])
+    tensor_type = ir.RankedTensorType.get(output_shape, dtype)
+    output = tensor.EmptyOp(output_shape, dtype)
+    generic_map = ir.AffineMap.get_permutation(
+        [i for i in range(len(output_shape))]
+    )
+    op = linalg.GenericOp(
+        [tensor_type],
+        [input1, input2],
+        [output],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(input2.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        select_op = arith.SelectOp(
-            block.arguments[1], value, block.arguments[0]
-        )
-        block.append(select_op)
-        block.append(linalg.YieldOp([select_op.result]))
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+            * len(output_shape)
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            ir.RankedTensorType(input1.type).element_type,
+            ir.RankedTensorType(input2.type).element_type,
+            ir.RankedTensorType(output.result.type).element_type,
+        ],
+    )
+    select_op = arith.SelectOp(block.arguments[1], value, block.arguments[0])
+    block.append(select_op)
+    block.append(linalg.YieldOp([select_op.result]))
 
     return op
 
 
 def slice_op(
-    node: torch.fx.Node,
+    node: SliceOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor slice operation.
-    From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice`
-    operation.
+    From buddy SliceOp to MLIR tensor `extract_slice` operation.
 
     Note: This op, get the slice of input node1.
     Args:
@@ -514,18 +499,14 @@ def slice_op(
     offset = [0 for x in input_shape]
     offset[dim] = start
     offset_attr = ir._denseI64ArrayAttr(offset, None)
-    output_shape = list(node.meta["tensor_meta"].shape)
+    output_shape = list(node.tensor_meta["shape"])
     size_attr = ir._denseI64ArrayAttr(output_shape, None)
     stride = [1 for x in output_shape]
     stride[dim] = step
     stride_attr = ir._denseI64ArrayAttr(stride, None)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-    if dtype == "torch.bool":
-        tensor_type = ir.RankedTensorType.get(
-            output_shape, ir.IntegerType.get_signless(1)
-        )
+    dtype = node.tensor_meta["dtype"]
+    dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, dtype)
 
     op = tensor.ExtractSliceOp(
         tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr
@@ -535,13 +516,12 @@ def slice_op(
 
 
 def expand_op(
-    node: torch.fx.Node,
+    node: ExpandOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor expand operation.
-    From PyTorch `aten.expand.default` operator to MLIR tensor `extract_slice`
-    operation.
+    From buddy ExpandOp to MLIR tensor `extract_slice` operation.
 
     Note: This op, based on expand shape, create a new tensor and extract slice
     from origin tensor.
@@ -559,26 +539,15 @@ def expand_op(
     if input1 is None:
         return
     input_shape = ir.RankedTensorType(input1.type).shape
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.bool":
-        empty_tensor = tensor.EmptyOp(
-            output_shape, ir.IntegerType.get_signless(1)
-        )
-    elif dtype == "torch.float32":
-        empty_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get())
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    dtype = mlir_element_type_get(dtype)
+    empty_tensor = tensor.EmptyOp(output_shape, dtype)
     if list(input_shape) == list(node.args[1]):
         offset_attr = ir._denseI64ArrayAttr([0 for x in input_shape], None)
         size_attr = ir._denseI64ArrayAttr(output_shape, None)
         stride_attr = ir._denseI64ArrayAttr([1 for x in input_shape], None)
-        if dtype == "torch.bool":
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.IntegerType.get_signless(1)
-            )
-        elif dtype == "torch.float32":
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
-            )
+        tensor_type = ir.RankedTensorType.get(output_shape, dtype)
         extract_tensor = tensor.ExtractSliceOp(
             tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr
         )
@@ -602,16 +571,10 @@ def expand_op(
                         [1] * (i + 1) + [x for x in output_shape[i + 1 :]], None
                     )
                     stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None)
-                    if dtype == "torch.bool":
-                        tensor_type = ir.RankedTensorType.get(
-                            [1] * (i + 1) + [x for x in output_shape[i + 1 :]],
-                            ir.IntegerType.get_signless(1),
-                        )
-                    elif dtype == "torch.float32":
-                        tensor_type = ir.RankedTensorType.get(
-                            [1] * (i + 1) + [x for x in output_shape[i + 1 :]],
-                            ir.F32Type.get(),
-                        )
+                    tensor_type = ir.RankedTensorType.get(
+                        [1] * (i + 1) + [x for x in output_shape[i + 1 :]],
+                        dtype,
+                    )
                     extract_tensor = tensor.ExtractSliceOp(
                         tensor_type,
                         input1,
@@ -639,12 +602,12 @@ def expand_op(
 
 
 def to_copy_op(
-    node: torch.fx.Node,
+    node: ToCopyOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor copy operation.
-    From PyTorch `aten._to_copy.default` operator to MLIR linalg `generic`
+    From buddy ToCopyOp to MLIR linalg `generic`
     operation.
 
     Note: This op, will convert input node's value type, such as float32 to
@@ -660,10 +623,10 @@ def to_copy_op(
     input1 = symbol_table.get((str(node.args[0]), 0))
     if input1 is None:
         return
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
 
-    if dtype == "torch.bool":
+    if dtype == TensorDType.Bool:
         if str(ir.RankedTensorType(input1.type).element_type) == "f32":
             tensor_type = ir.RankedTensorType.get(
                 output_shape, ir.IntegerType.get_signless(1)
@@ -713,7 +676,7 @@ def to_copy_op(
             block.append(fptosi_op)
             block.append(trunc_op)
             block.append(linalg.YieldOp([trunc_op.result]))
-    elif dtype == "torch.float32":
+    elif dtype == TensorDType.Float32:
         if str(ir.RankedTensorType(input1.type).element_type) == "i1":
             tensor_type = ir.RankedTensorType.get(
                 output_shape, ir.F32Type.get()
@@ -764,12 +727,12 @@ def to_copy_op(
 
 
 def rsub_op(
-    node: torch.fx.Node,
+    node: RsubOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor rsub operation.
-    From PyTorch `aten.rsub.Scalar` operator to MLIR linalg `generic` operation.
+    From buddy RsubOp to MLIR linalg `generic` operation.
 
     Note: This op, compute input node1 rsub input node2
     Args:
@@ -782,20 +745,94 @@ def rsub_op(
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     value = node.args[1]
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if not isinstance(value, torch.fx.Node):
-        if dtype == "torch.float32":
-            value = arith.ConstantOp(
-                ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), value)
-            )
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    if not isinstance(value, str):
+        value = arith.ConstantOp(
+            mlir_dtype, mlir_element_attr_get(dtype, value)
+        )
+        generic_map = ir.AffineMap.get_permutation(
+            [i for i in range(len(output_shape))]
+        )
+        tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+        output = tensor.EmptyOp(output_shape, mlir_dtype)
+        op = linalg.GenericOp(
+            [tensor_type],
+            [input1],
+            [output],
+            ir.ArrayAttr.get(
+                [
+                    ir.AffineMapAttr.get(
+                        generic_map.get_submap(
+                            [i for i in range(len(output_shape))]
+                        )
+                    ),
+                    ir.AffineMapAttr.get(
+                        generic_map.get_submap(
+                            [i for i in range(len(output_shape))]
+                        )
+                    ),
+                ]
+            ),
+            ir.ArrayAttr.get(
+                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+                * len(output_shape)
+            ),
+        )
+        block = ir.Block.create_at_start(
+            op.region,
+            [
+                ir.RankedTensorType(input1.type).element_type,
+                ir.RankedTensorType(output.result.type).element_type,
+            ],
+        )
+        if str(ir.RankedTensorType(input1.type).element_type).find("i") != -1:
+            sub_op = arith.SubIOp(value.result, block.arguments[0])
+        else:
+            sub_op = arith.SubFOp(value.result, block.arguments[0])
+        block.append(sub_op)
+        block.append(linalg.YieldOp([sub_op.result]))
+
+    return op
+
+
+def pow_op(
+    node: PowOp,
+    symbol_table: Dict[Tuple[str, int], ir.Operation],
+):
+    """
+    Import the tensor pow operation.
+    From buddy PowOp to MLIR linalg `generic`
+    operation.
+
+    Note: This op, compute input node's power result.
+    Args:
+        node: Containing information from the input graph node.
+        symbol_table: A dictionary mapping symbols to their corresponding
+        operations.
+
+    Returns:
+        op: The operation return the linalg.generic op.
+    """
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    if input1 is None:
+        return
+    value = node.args[1]
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    dtype = mlir_element_type_get(dtype)
+    if not isinstance(value, str):
+        if abs(int(value) - float(value)) < 1e-6:
             generic_map = ir.AffineMap.get_permutation(
                 [i for i in range(len(output_shape))]
             )
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
+            tensor_type = ir.RankedTensorType.get(output_shape, dtype)
+            output = tensor.EmptyOp(output_shape, dtype)
+            value = arith.ConstantOp(
+                ir.IntegerType.get_signless(32),
+                ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value),
             )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
             op = linalg.GenericOp(
                 [tensor_type],
                 [input1],
@@ -826,23 +863,28 @@ def rsub_op(
                     ir.RankedTensorType(output.result.type).element_type,
                 ],
             )
-            subf_op = arith.SubFOp(value.result, block.arguments[0])
-            block.append(subf_op)
-            block.append(linalg.YieldOp([subf_op.result]))
+            if (
+                str(ir.RankedTensorType(input1.type).element_type).find("i")
+                != -1
+            ):
+                powi_op = math.IPowIOp(block.arguments[0], value.result)
+            else:
+                powi_op = math.FPowIOp(block.arguments[0], value.result)
+            block.append(powi_op)
+            block.append(linalg.YieldOp([powi_op.result]))
 
     return op
 
 
-def pow_op(
-    node: torch.fx.Node,
+def mean_op(
+    node: MeanOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor copy operation.
-    From PyTorch `aten.pow.Tensor_Scalar` operator to MLIR linalg `generic`
-    operation.
+    From buddy MeanOp to MLIR linalg `generic` operation.
 
-    Note: This op, compute input node's power result.
+    Note: This op, compute input node's mean result in a specified dim.
     Args:
         node: Containing information from the input graph node.
         symbol_table: A dictionary mapping symbols to their corresponding
@@ -854,160 +896,91 @@ def pow_op(
     input1 = symbol_table.get((str(node.args[0]), 0))
     if input1 is None:
         return
-    value = node.args[1]
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if not isinstance(value, torch.fx.Node):
-        if dtype == "torch.float32":
+    dims = list(node.args[1])
+    keep_dim = bool(node.args[2])
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0.0)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    output = arith.ConstantOp(tensor_type, attr)
+    assert len(dims) == 1
+    for dim in dims:
+        if dim < 0:
+            dim = len(list(ir.RankedTensorType(input1.type).shape)) + dim
+        if keep_dim:
             generic_map = ir.AffineMap.get_permutation(
-                [i for i in range(len(output_shape))]
+                [i for i in range(len(output_shape) + 1)]
             )
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
+            tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+            output_map = [i for i in range(len(output_shape))]
+            output_map[dim] = len(output_shape)
+            loop_type = [
+                ir.Attribute.parse("#linalg.iterator_type<parallel>")
+            ] * (len(output_shape) + 1)
+            loop_type[dim] = ir.Attribute.parse(
+                "#linalg.iterator_type<reduction>"
             )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-            if abs(int(value) - float(value)) < 1e-6:
-                value = arith.ConstantOp(
-                    ir.IntegerType.get_signless(32),
-                    ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value),
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                fpowi_op = math.FPowIOp(block.arguments[0], value.result)
-                block.append(fpowi_op)
-                block.append(linalg.YieldOp([fpowi_op.result]))
-
-    return op
-
-
-def mean_op(
-    node: torch.fx.Node,
-    symbol_table: Dict[Tuple[str, int], ir.Operation],
-):
-    """
-    Import the tensor copy operation.
-    From PyTorch `aten.mean.dim` operator to MLIR linalg `generic` operation.
-
-    Note: This op, compute input node's mean result in a specified dim.
-    Args:
-        node: Containing information from the input graph node.
-        symbol_table: A dictionary mapping symbols to their corresponding
-        operations.
-
-    Returns:
-        op: The operation return the linalg.generic op.
-    """
-    input1 = symbol_table.get((str(node.args[0]), 0))
-    if input1 is None:
-        return
-    dims = list(node.args[1])
-    keep_dim = bool(node.args[2])
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        element = ir.FloatAttr.get(ir.F32Type.get(), 0.0)
-        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
-        output = arith.ConstantOp(tensor_type, attr)
-
-        assert len(dims) == 1
-
-        for dim in dims:
-            if dim == -1:
-                dim = len(list(ir.RankedTensorType(input1.type).shape)) - 1
-            if keep_dim:
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(output_shape) + 1)]
-                )
-                tensor_type = ir.RankedTensorType.get(
-                    output_shape, ir.F32Type.get()
-                )
-                output_map = [i for i in range(len(output_shape))]
-                output_map[dim] = len(output_shape)
-                loop_type = [
-                    ir.Attribute.parse("#linalg.iterator_type<parallel>")
-                ] * (len(output_shape) + 1)
-                loop_type[dim] = ir.Attribute.parse(
-                    "#linalg.iterator_type<reduction>"
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(output_map)
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(loop_type),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
+            op = linalg.GenericOp(
+                [tensor_type],
+                [input1],
+                [output],
+                ir.ArrayAttr.get(
                     [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
+                        ir.AffineMapAttr.get(
+                            generic_map.get_submap(
+                                [i for i in range(len(output_shape))]
+                            )
+                        ),
+                        ir.AffineMapAttr.get(
+                            generic_map.get_submap(output_map)
+                        ),
+                    ]
+                ),
+                ir.ArrayAttr.get(loop_type),
+            )
+            block = ir.Block.create_at_start(
+                op.region,
+                [
+                    ir.RankedTensorType(input1.type).element_type,
+                    ir.RankedTensorType(output.result.type).element_type,
+                ],
+            )
+            value = arith.ConstantOp(
+                mlir_dtype,
+                mlir_element_attr_get(
+                    dtype, list(ir.RankedTensorType(input1.type).shape)[dim]
+                ),
+            )
+            if (
+                str(ir.RankedTensorType(input1.type).element_type).find("i")
+                != -1
+            ):
+                block_div_op = arith.DivSIOp(block.arguments[0], value.result)
+                block_add_op = arith.AddIOp(
+                    block_div_op.result, block.arguments[1]
                 )
-                value = arith.ConstantOp(
-                    ir.F32Type.get(),
-                    ir.FloatAttr.get(
-                        ir.F32Type.get(),
-                        list(ir.RankedTensorType(input1.type).shape)[dim],
-                    ),
+            else:
+                block_div_op = arith.DivFOp(block.arguments[0], value.result)
+                block_add_op = arith.AddFOp(
+                    block_div_op.result, block.arguments[1]
                 )
-                divf_op = arith.DivFOp(block.arguments[0], value.result)
-                addf_op = arith.AddFOp(divf_op.result, block.arguments[1])
-                block.append(value)
-                block.append(divf_op)
-                block.append(addf_op)
-                block.append(linalg.YieldOp([addf_op.result]))
+            block.append(value)
+            block.append(block_div_op)
+            block.append(block_add_op)
+            block.append(linalg.YieldOp([block_add_op.result]))
 
     return op
 
 
 def rsqrt_op(
-    node: torch.fx.Node,
+    node: RsqrtOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor rsqrt operation.
-    From PyTorch `aten.rsqrt.default` operator to MLIR linalg `generic`
-    operation.
+    From buddy RsqrtOp to MLIR linalg `generic` operation.
 
     Note: This op, compute input node's rsqrt result.
     Args:
@@ -1023,59 +996,58 @@ def rsqrt_op(
     if input1 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation(
-            [i for i in range(len(output_shape))]
-        )
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input1],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    generic_map = ir.AffineMap.get_permutation(
+        [i for i in range(len(output_shape))]
+    )
+    op = linalg.GenericOp(
+        [tensor_type],
+        [input1],
+        [output],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        math_rsqrt_op = math.RsqrtOp(block.arguments[0])
-        block.append(math_rsqrt_op)
-        block.append(linalg.YieldOp([math_rsqrt_op.result]))
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+            * len(output_shape)
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            ir.RankedTensorType(input1.type).element_type,
+            ir.RankedTensorType(output.result.type).element_type,
+        ],
+    )
+    math_rsqrt_op = math.RsqrtOp(block.arguments[0])
+    block.append(math_rsqrt_op)
+    block.append(linalg.YieldOp([math_rsqrt_op.result]))
 
     return op
 
 
 def mul_op(
-    node: torch.fx.Node,
+    node: MulOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor mul operation.
-    From PyTorch `aten.mul.Tensor` operator to MLIR linalg `generic` operation.
+    From buddy MulOp to MLIR tosa `mul` operation.
 
     Note: This op, compute input node's mul result.
     Args:
@@ -1087,257 +1059,38 @@ def mul_op(
         op: The operation return the linalg.generic op.
     """
     assert len(node.args) == 2
-    if isinstance(node.args[0], torch.fx.Node):
-        input1 = symbol_table.get((str(node.args[0]), 0))
-    else:
-        input1 = node.args[0]
-
-    if isinstance(node.args[1], torch.fx.Node):
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    shape = list(node.tensor_meta["shape"])
+    if isinstance(node.args[1], str):
         input2 = symbol_table.get((str(node.args[1]), 0))
     else:
-        input2 = node.args[1]
-
+        data = [node.args[1]]
+        input2_shape = numpy.array(data).shape
+        tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype)
+        element = mlir_element_attr_get(dtype, node.args[1])
+        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+        input2 = arith.ConstantOp(tensor_type, attr).result
     if input1 is None or input2 is None:
         return
-
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-
-    if isinstance(node.args[0], torch.fx.Node):
-        if dtype == "torch.float32":
-            if not isinstance(node.args[1], torch.fx.Node):
-                input2 = arith.ConstantOp(
-                    ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2)
-                )
-                tensor_type = ir.RankedTensorType.get(
-                    output_shape, ir.F32Type.get()
-                )
-                output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(output_shape))]
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                mulf_op = arith.MulFOp(block.arguments[0], input2.result)
-                block.append(mulf_op)
-                block.append(linalg.YieldOp([mulf_op.result]))
-            else:
-                tensor_type = ir.RankedTensorType.get(
-                    output_shape, ir.F32Type.get()
-                )
-                output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                input1_shape = list(ir.RankedTensorType(input1.type).shape)
-                if input1_shape != output_shape:
-                    dims = []
-                    for i in range(len(input1_shape) - 1, -1, -1):
-                        if (
-                            input1_shape[i]
-                            != output_shape[
-                                len(output_shape) - (len(input1_shape) - i)
-                            ]
-                        ):
-                            dims.append(i)
-                    output1 = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                    generic_map = ir.AffineMap.get_permutation(
-                        [i for i in range(len(output_shape) + len(dims))]
-                    )
-                    input1_map = [
-                        i
-                        for i in range(
-                            len(output_shape) - len(input1_shape),
-                            len(output_shape),
-                        )
-                    ]
-                    for index, i in enumerate(dims):
-                        input1_map[i] = len(output_shape) + index
-                    input1_map = generic_map.get_submap(input1_map)
-                    input1_op = linalg.GenericOp(
-                        [tensor_type],
-                        [input1],
-                        [output1],
-                        ir.ArrayAttr.get(
-                            [
-                                ir.AffineMapAttr.get(input1_map),
-                                ir.AffineMapAttr.get(
-                                    generic_map.get_submap(
-                                        [i for i in range(len(output_shape))]
-                                    )
-                                ),
-                            ]
-                        ),
-                        ir.ArrayAttr.get(
-                            [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<parallel>"
-                                )
-                            ]
-                            * len(output_shape)
-                            + [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<reduction>"
-                                )
-                            ]
-                            * len(dims)
-                        ),
-                    )
-                    block = ir.Block.create_at_start(
-                        input1_op.region,
-                        [
-                            ir.RankedTensorType(input1.type).element_type,
-                            ir.RankedTensorType(
-                                output.result.type
-                            ).element_type,
-                        ],
-                    )
-                    block.append(linalg.YieldOp([block.arguments[0]]))
-                    input1 = input1_op.result
-
-                input2_shape = list(ir.RankedTensorType(input2.type).shape)
-                if input2_shape != output_shape:
-                    dims = []
-                    for i in range(len(input2_shape) - 1, -1, -1):
-                        if (
-                            input2_shape[i]
-                            != output_shape[
-                                len(output_shape) - (len(input2_shape) - i)
-                            ]
-                        ):
-                            dims.append(i)
-                    output2 = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                    generic_map = ir.AffineMap.get_permutation(
-                        [i for i in range(len(output_shape) + len(dims))]
-                    )
-                    input2_map = [
-                        i
-                        for i in range(
-                            len(output_shape) - len(input2_shape),
-                            len(output_shape),
-                        )
-                    ]
-                    for index, i in enumerate(dims):
-                        input2_map[i] = len(output_shape) + index
-                    input2_map = generic_map.get_submap(input2_map)
-                    input2_op = linalg.GenericOp(
-                        [tensor_type],
-                        [input2],
-                        [output2],
-                        ir.ArrayAttr.get(
-                            [
-                                ir.AffineMapAttr.get(input2_map),
-                                ir.AffineMapAttr.get(
-                                    generic_map.get_submap(
-                                        [i for i in range(len(output_shape))]
-                                    )
-                                ),
-                            ]
-                        ),
-                        ir.ArrayAttr.get(
-                            [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<parallel>"
-                                )
-                            ]
-                            * len(output_shape)
-                            + [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<reduction>"
-                                )
-                            ]
-                            * len(dims)
-                        ),
-                    )
-                    block = ir.Block.create_at_start(
-                        input2_op.region,
-                        [
-                            ir.RankedTensorType(input2.type).element_type,
-                            ir.RankedTensorType(
-                                output.result.type
-                            ).element_type,
-                        ],
-                    )
-                    block.append(linalg.YieldOp([block.arguments[0]]))
-                    input2 = input2_op.result
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(output_shape))]
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1, input2],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(input2.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                mulf_op = arith.MulFOp(block.arguments[0], block.arguments[1])
-                block.append(mulf_op)
-                block.append(linalg.YieldOp([mulf_op.result]))
-
-    return op
+    mul_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype)
+    op = tosa.MulOp(
+        mul_result_tensor_type,
+        input1,
+        input2,
+        ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0),
+    )
+    return op.result
 
 
 def t_op(
-    node: torch.fx.Node,
+    node: TOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor tanspose operation.
-    From PyTorch `aten.t.default` operator to MLIR linalg `generic` operation.
+    From buddy TOp to MLIR linalg `transpose` operation.
 
     Note: This op, compute input node's transpose result.
     Args:
@@ -1353,50 +1106,23 @@ def t_op(
     if input1 is None:
         return
 
-    input_shape = list(ir.RankedTensorType(input1.type).shape)
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if len(input_shape) == 2:
-        if dtype == "torch.float32":
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
-            )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-            generic_map = ir.AffineMap.get_permutation([0, 1])
-            op = linalg.GenericOp(
-                [tensor_type],
-                [input1],
-                [output],
-                ir.ArrayAttr.get(
-                    [
-                        ir.AffineMapAttr.get(generic_map.get_submap([0, 1])),
-                        ir.AffineMapAttr.get(generic_map.get_submap([1, 0])),
-                    ]
-                ),
-                ir.ArrayAttr.get(
-                    [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                    * len(output_shape)
-                ),
-            )
-            block = ir.Block.create_at_start(
-                op.region,
-                [
-                    ir.RankedTensorType(input1.type).element_type,
-                    ir.RankedTensorType(output.result.type).element_type,
-                ],
-            )
-            block.append(linalg.YieldOp([block.arguments[0]]))
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    perm = ir._denseI64ArrayAttr([1, 0], None)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    op = linalg.transpose(input=input1, outs=[output], permutation=perm)
 
-    return op
+    return op.result[0]
 
 
 def matmul_op(
-    node: torch.fx.Node,
+    node: MatmulOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor matmul operation.
-    From PyTorch `aten.mm.default` operator to MLIR linalg `matmul` operation.
+    From Buddy MatmulOp to MLIR linalg `matmul` operation.
 
     Note: This op, compute input node's matrix multiplication result.
     Args:
@@ -1413,25 +1139,24 @@ def matmul_op(
     if input1 is None or input2 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        f32 = ir.F32Type.get()
-        element = ir.FloatAttr.get(f32, 0.0)
-        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
-        matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result
-        op = linalg.matmul(input1, input2, outs=[matmul_result_buffer])
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0.0)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result
+    op = linalg.matmul(input1, input2, outs=[matmul_result_buffer])
     return op
 
 
 def transpose_op(
-    node: torch.fx.Node,
+    node: TransposeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor transpose operation.
-    From PyTorch `aten.transpose.int` operator to MLIR linalg `generic`
+    From buddy TransposeOp to MLIR linalg `transpose`
     operation.
 
     Note: This op, compute input node's transpose result.
@@ -1449,51 +1174,25 @@ def transpose_op(
         return
     dim1 = int(node.args[1])
     dim2 = int(node.args[2])
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation(
-            [i for i in range(len(output_shape))]
-        )
-        input1_map = [i for i in range(len(output_shape))]
-        input1_map[dim1], input1_map[dim2] = input1_map[dim2], input1_map[dim1]
-        output_map = [i for i in range(len(output_shape))]
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input1],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(generic_map.get_submap(input1_map)),
-                    ir.AffineMapAttr.get(generic_map.get_submap(output_map)),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
-            [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        block.append(linalg.YieldOp([block.arguments[0]]))
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    output_perm = [i for i in range(len(output_shape))]
+    output_perm[dim2], output_perm[dim1] = output_perm[dim1], output_perm[dim2]
+    perm = ir._denseI64ArrayAttr(output_perm, None)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    op = linalg.transpose(input=input1, outs=[output], permutation=perm)
 
-    return op
+    return op.result[0]
 
 
 def index_op(
-    node: torch.fx.Node,
+    node: IndexOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor index operation.
-    From PyTorch `aten.index.Tensor` operator to MLIR linalg `generic`
+    From buddy IndexOp to MLIR linalg `generic`
     operation.
 
     Note: This op, get input node slice result by input index.
@@ -1511,70 +1210,66 @@ def index_op(
         return
     input1_shape = ir.RankedTensorType(input1.type).shape
     input2 = node.args[1]
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
     if len(input2) < len(input1_shape):
-        if dtype == "torch.float32":
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
-            )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-            loops = ir.RankedTensorType(
-                symbol_table.get((str(input2[0]), 0)).type
-            ).shape
-            generic_map = ir.AffineMap.get_permutation(
-                [i for i in range(len(output_shape))]
+        tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+        output = tensor.EmptyOp(output_shape, mlir_dtype)
+        loops = ir.RankedTensorType(
+            symbol_table.get((str(input2[0]), 0)).type
+        ).shape
+        generic_map = ir.AffineMap.get_permutation(
+            [i for i in range(len(output_shape))]
+        )
+        input_map = [
+            ir.AffineMapAttr.get(
+                generic_map.get_submap([j for j in range(len(loops))])
             )
-            input_map = [
-                ir.AffineMapAttr.get(
-                    generic_map.get_submap([j for j in range(len(loops))])
-                )
-                for i in range(len(input2))
-            ] + [
-                ir.AffineMapAttr.get(
-                    generic_map.get_submap(
-                        [j for j in range(len(output_shape))]
-                    )
-                )
-            ]
-            operands = [symbol_table.get((str(i), 0)) for i in input2]
-            op = linalg.GenericOp(
-                [tensor_type],
-                operands,
-                [output],
-                ir.ArrayAttr.get(input_map),
-                ir.ArrayAttr.get(
-                    [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                    * len(output_shape)
-                ),
+            for i in range(len(input2))
+        ] + [
+            ir.AffineMapAttr.get(
+                generic_map.get_submap([j for j in range(len(output_shape))])
             )
-            arguments = [
-                ir.RankedTensorType(i.type).element_type for i in operands
-            ] + [ir.RankedTensorType(output.result.type).element_type]
-            block = ir.Block.create_at_start(op.region, arguments)
-            index = []
-            for i in block.arguments[:-1]:
-                indexcast_op = arith.IndexCastOp(ir.IndexType.get(), i)
-                block.append(indexcast_op)
-                index.append(indexcast_op.result)
-            for i in range(len(loops), len(output_shape) - len(input2) + 1):
-                index_op = linalg.IndexOp(ir._i64Attr(i, None))
-                block.append(index_op)
-                index.append(index_op.result)
-            value = tensor.ExtractOp(input1, index)
-            block.append(value)
-            block.append(linalg.YieldOp([value.result]))
+        ]
+        operands = [symbol_table.get((str(i), 0)) for i in input2]
+        op = linalg.GenericOp(
+            [tensor_type],
+            operands,
+            [output],
+            ir.ArrayAttr.get(input_map),
+            ir.ArrayAttr.get(
+                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+                * len(output_shape)
+            ),
+        )
+        arguments = [
+            ir.RankedTensorType(i.type).element_type for i in operands
+        ] + [ir.RankedTensorType(output.result.type).element_type]
+        block = ir.Block.create_at_start(op.region, arguments)
+        index = []
+        for i in block.arguments[:-1]:
+            indexcast_op = arith.IndexCastOp(ir.IndexType.get(), i)
+            block.append(indexcast_op)
+            index.append(indexcast_op.result)
+        for i in range(len(loops), len(output_shape) - len(input2) + 1):
+            index_op = linalg.IndexOp(ir._i64Attr(i, None))
+            block.append(index_op)
+            index.append(index_op.result)
+        value = tensor.ExtractOp(input1, index)
+        block.append(value)
+        block.append(linalg.YieldOp([value.result]))
 
     return op
 
 
 def neg_op(
-    node: torch.fx.Node,
+    node: NegOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor neg operation.
-    From PyTorch `aten.neg.default` operator to MLIR linalg `matmul` operation.
+    From buddy NegOp to MLIR linalg `negf` operation.
 
     Note: This op, compute input node's neg result.
     Args:
@@ -1589,59 +1284,22 @@ def neg_op(
     input1 = symbol_table.get((str(node.args[0]), 0))
     if input1 is None:
         return
-
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation(
-            [i for i in range(len(output_shape))]
-        )
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input1],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
-            [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        negf_op = arith.NegFOp(block.arguments[0])
-        block.append(negf_op)
-        block.append(linalg.YieldOp([negf_op.result]))
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    op = linalg.negf(input1, outs=output)
 
     return op
 
 
 def cat_op(
-    node: torch.fx.Node,
+    node: CatOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor concate operation.
-    From PyTorch `aten.cat.default` operator to MLIR tensor `insert_slice`
+    From buddy CatOp to MLIR tensor `insert_slice`
     operation.
 
     Note: This op, concate two input tensor.
@@ -1660,52 +1318,52 @@ def cat_op(
     if input1 is None or input2 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
+    output_shape = list(node.tensor_meta["shape"])
     if dim < 0:
         dim = len(output_shape) + dim
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        offset = [0 for x in output_shape]
-        offset_attr = ir._denseI64ArrayAttr(offset, None)
-        input1_shape = ir.RankedTensorType(input1.type).shape
-        size_attr = ir._denseI64ArrayAttr(input1_shape, None)
-        stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None)
-        insert_input1 = tensor.InsertSliceOp(
-            input1,
-            output.result,
-            [],
-            [],
-            [],
-            offset_attr,
-            size_attr,
-            stride_attr,
-        )
-        offset[dim] += input1_shape[dim]
-        offset_attr = ir._denseI64ArrayAttr(offset, None)
-        input2_shape = ir.RankedTensorType(input2.type).shape
-        size_attr = ir._denseI64ArrayAttr(input2_shape, None)
-        insert_input2 = tensor.InsertSliceOp(
-            input2,
-            insert_input1.result,
-            [],
-            [],
-            [],
-            offset_attr,
-            size_attr,
-            stride_attr,
-        )
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    offset = [0 for x in output_shape]
+    offset_attr = ir._denseI64ArrayAttr(offset, None)
+    input1_shape = ir.RankedTensorType(input1.type).shape
+    size_attr = ir._denseI64ArrayAttr(input1_shape, None)
+    stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None)
+    insert_input1 = tensor.InsertSliceOp(
+        input1,
+        output.result,
+        [],
+        [],
+        [],
+        offset_attr,
+        size_attr,
+        stride_attr,
+    )
+    offset[dim] += input1_shape[dim]
+    offset_attr = ir._denseI64ArrayAttr(offset, None)
+    input2_shape = ir.RankedTensorType(input2.type).shape
+    size_attr = ir._denseI64ArrayAttr(input2_shape, None)
+    insert_input2 = tensor.InsertSliceOp(
+        input2,
+        insert_input1.result,
+        [],
+        [],
+        [],
+        offset_attr,
+        size_attr,
+        stride_attr,
+    )
 
     return insert_input2
 
 
 def squeeze_op(
-    node: torch.fx.Node,
+    node: SqueezeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor squeeze operation.
-    From PyTorch `aten.squeeze.dim` operator to MLIR linalg `generic` operation.
+    From buddy SqueezeOp to MLIR linalg `generic` operation.
 
     Note: This op, reduce the input tensor's shape dims by specified dim.
     Args:
@@ -1722,78 +1380,78 @@ def squeeze_op(
     if input1 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
+    output_shape = list(node.tensor_meta["shape"])
     input1_shape = ir.RankedTensorType(input1.type).shape
     if dim < 0:
         dim = len(input1_shape) + dim
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        if input1_shape[dim] != 1:
-            offset = [0 for x in output_shape]
-            offset_attr = ir._denseI64ArrayAttr(offset, None)
-            size_attr = ir._denseI64ArrayAttr(input1_shape, None)
-            stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None)
-            op = tensor.InsertSliceOp(
-                input1,
-                output.result,
-                [],
-                [],
-                [],
-                offset_attr,
-                size_attr,
-                stride_attr,
-            )
-        else:
-            output_map = ir.AffineMap.get(
-                len(output_shape),
-                0,
-                [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))],
-            )
-            input1_map = []
-            loop_index = 0
-            for i in range(len(input1_shape)):
-                if len(input1_map) == dim:
-                    input1_map.append(ir.AffineExpr.get_constant(0))
-                else:
-                    input1_map.append(ir.AffineExpr.get_dim(loop_index))
-                    loop_index += 1
-            input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
-            op = linalg.GenericOp(
-                [tensor_type],
-                [input1],
-                [output],
-                ir.ArrayAttr.get(
-                    [
-                        ir.AffineMapAttr.get(input1_map),
-                        ir.AffineMapAttr.get(output_map),
-                    ]
-                ),
-                ir.ArrayAttr.get(
-                    [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                    * len(output_shape)
-                ),
-            )
-            block = ir.Block.create_at_start(
-                op.region,
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    if input1_shape[dim] != 1:
+        offset = [0 for x in output_shape]
+        offset_attr = ir._denseI64ArrayAttr(offset, None)
+        size_attr = ir._denseI64ArrayAttr(input1_shape, None)
+        stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None)
+        op = tensor.InsertSliceOp(
+            input1,
+            output.result,
+            [],
+            [],
+            [],
+            offset_attr,
+            size_attr,
+            stride_attr,
+        )
+    else:
+        output_map = ir.AffineMap.get(
+            len(output_shape),
+            0,
+            [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))],
+        )
+        input1_map = []
+        loop_index = 0
+        for i in range(len(input1_shape)):
+            if len(input1_map) == dim:
+                input1_map.append(ir.AffineExpr.get_constant(0))
+            else:
+                input1_map.append(ir.AffineExpr.get_dim(loop_index))
+                loop_index += 1
+        input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
+        op = linalg.GenericOp(
+            [tensor_type],
+            [input1],
+            [output],
+            ir.ArrayAttr.get(
                 [
-                    ir.RankedTensorType(input1.type).element_type,
-                    ir.RankedTensorType(output.result.type).element_type,
-                ],
-            )
-            block.append(linalg.YieldOp([block.arguments[0]]))
+                    ir.AffineMapAttr.get(input1_map),
+                    ir.AffineMapAttr.get(output_map),
+                ]
+            ),
+            ir.ArrayAttr.get(
+                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+                * len(output_shape)
+            ),
+        )
+        block = ir.Block.create_at_start(
+            op.region,
+            [
+                ir.RankedTensorType(input1.type).element_type,
+                ir.RankedTensorType(output.result.type).element_type,
+            ],
+        )
+        block.append(linalg.YieldOp([block.arguments[0]]))
 
     return op
 
 
 def batch_matmul_op(
-    node: torch.fx.Node,
+    node: BatchMatmulOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor batch matmul operation.
-    From PyTorch `aten.bmm.default` operator to MLIR linalg `batch_matmul`
+    From buddy BatchMatmulOp to MLIR linalg `batch_matmul`
     operation.
 
     Note: This op, compute input node's batch matrix multiplication result.
@@ -1811,45 +1469,25 @@ def batch_matmul_op(
     if input1 is None or input2 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        # use linalg.generic implementation
-        generic_map = ir.AffineMap.get_permutation([0, 1, 2])
-        zero_fill = linalg.GenericOp(
-            [tensor_type],
-            [],
-            [output],
-            ir.ArrayAttr.get(
-                [ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2]))]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")] * 3
-            ),
-        )
-        block = ir.Block.create_at_start(
-            zero_fill.region,
-            [ir.RankedTensorType(output.result.type).element_type],
-        )
-        zero_op = arith.ConstantOp(
-            ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0)
-        )
-        block.append(zero_op)
-        block.append(linalg.YieldOp([zero_op.result]))
-        op = linalg.batch_matmul(input1, input2, outs=[zero_fill.result])
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    zero_fill = arith.ConstantOp(tensor_type, attr).result
+    op = linalg.batch_matmul(input1, input2, outs=[zero_fill])
 
     return op
 
 
 def div_op(
-    node: torch.fx.Node,
+    node: DivOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor divsion operation.
-    From PyTorch `aten.div.Tensor` operator to MLIR linalg `generic` operation.
+    From buddy DivOp to MLIR TOSA `reciprocal` and `mul` operations.
 
     Note: This op, compute input node's division result.
     Args:
@@ -1861,258 +1499,38 @@ def div_op(
         op: The operation return the linalg.generic op.
     """
     assert len(node.args) == 2
-    if isinstance(node.args[0], torch.fx.Node):
-        input1 = symbol_table.get((str(node.args[0]), 0))
-    else:
-        input1 = node.args[0]
-
-    if isinstance(node.args[1], torch.fx.Node):
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    shape = list(node.tensor_meta["shape"])
+    if isinstance(node.args[1], str):
         input2 = symbol_table.get((str(node.args[1]), 0))
     else:
-        input2 = node.args[1]
-
+        data = [node.args[1]]
+        input2_shape = numpy.array(data).shape
+        tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype)
+        element = mlir_element_attr_get(dtype, node.args[1])
+        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+        input2 = arith.ConstantOp(tensor_type, attr).result
     if input1 is None or input2 is None:
         return
-
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-
-    if isinstance(node.args[0], torch.fx.Node):
-        if dtype == "torch.float32":
-            if not isinstance(node.args[1], torch.fx.Node):
-                input2 = arith.ConstantOp(
-                    ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2)
-                )
-                tensor_type = ir.RankedTensorType.get(
-                    output_shape, ir.F32Type.get()
-                )
-                output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(output_shape))]
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                divf_op = arith.DivFOp(block.arguments[0], input2.result)
-                block.append(divf_op)
-                block.append(linalg.YieldOp([divf_op.result]))
-            else:
-                tensor_type = ir.RankedTensorType.get(
-                    output_shape, ir.F32Type.get()
-                )
-                output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                input1_shape = list(ir.RankedTensorType(input1.type).shape)
-                if input1_shape != output_shape:
-                    dims = []
-                    for i in range(len(input1_shape) - 1, -1, -1):
-                        if (
-                            input1_shape[i]
-                            != output_shape[
-                                len(output_shape) - (len(input1_shape) - i)
-                            ]
-                        ):
-                            dims.append(i)
-                    output1 = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                    generic_map = ir.AffineMap.get_permutation(
-                        [i for i in range(len(output_shape) + len(dims))]
-                    )
-                    input1_map = [
-                        i
-                        for i in range(
-                            len(output_shape) - len(input1_shape),
-                            len(output_shape),
-                        )
-                    ]
-                    for index, i in enumerate(dims):
-                        input1_map[i] = len(output_shape) + index
-                    input1_map = generic_map.get_submap(input1_map)
-                    input1_op = linalg.GenericOp(
-                        [tensor_type],
-                        [input1],
-                        [output1],
-                        ir.ArrayAttr.get(
-                            [
-                                ir.AffineMapAttr.get(input1_map),
-                                ir.AffineMapAttr.get(
-                                    generic_map.get_submap(
-                                        [i for i in range(len(output_shape))]
-                                    )
-                                ),
-                            ]
-                        ),
-                        ir.ArrayAttr.get(
-                            [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<parallel>"
-                                )
-                            ]
-                            * len(output_shape)
-                            + [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<reduction>"
-                                )
-                            ]
-                            * len(dims)
-                        ),
-                    )
-                    block = ir.Block.create_at_start(
-                        input1_op.region,
-                        [
-                            ir.RankedTensorType(input1.type).element_type,
-                            ir.RankedTensorType(
-                                output.result.type
-                            ).element_type,
-                        ],
-                    )
-                    block.append(linalg.YieldOp([block.arguments[0]]))
-                    input1 = input1_op.result
-
-                input2_shape = list(ir.RankedTensorType(input2.type).shape)
-                if input2_shape != output_shape:
-                    dims = []
-                    for i in range(len(input2_shape) - 1, -1, -1):
-                        if (
-                            input2_shape[i]
-                            != output_shape[
-                                len(output_shape) - (len(input2_shape) - i)
-                            ]
-                        ):
-                            dims.append(i)
-                    output2 = tensor.EmptyOp(output_shape, ir.F32Type.get())
-                    generic_map = ir.AffineMap.get_permutation(
-                        [i for i in range(len(output_shape) + len(dims))]
-                    )
-                    input2_map = [
-                        i
-                        for i in range(
-                            len(output_shape) - len(input2_shape),
-                            len(output_shape),
-                        )
-                    ]
-                    for index, i in enumerate(dims):
-                        input2_map[i] = len(output_shape) + index
-                    input2_map = generic_map.get_submap(input2_map)
-                    input2_op = linalg.GenericOp(
-                        [tensor_type],
-                        [input2],
-                        [output2],
-                        ir.ArrayAttr.get(
-                            [
-                                ir.AffineMapAttr.get(input2_map),
-                                ir.AffineMapAttr.get(
-                                    generic_map.get_submap(
-                                        [i for i in range(len(output_shape))]
-                                    )
-                                ),
-                            ]
-                        ),
-                        ir.ArrayAttr.get(
-                            [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<parallel>"
-                                )
-                            ]
-                            * len(output_shape)
-                            + [
-                                ir.Attribute.parse(
-                                    "#linalg.iterator_type<reduction>"
-                                )
-                            ]
-                            * len(dims)
-                        ),
-                    )
-                    block = ir.Block.create_at_start(
-                        input2_op.region,
-                        [
-                            ir.RankedTensorType(input2.type).element_type,
-                            ir.RankedTensorType(
-                                output.result.type
-                            ).element_type,
-                        ],
-                    )
-                    block.append(linalg.YieldOp([block.arguments[0]]))
-                    input2 = input2_op.result
-                generic_map = ir.AffineMap.get_permutation(
-                    [i for i in range(len(output_shape))]
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1, input2],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(input2.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                divf_op = arith.DivFOp(block.arguments[0], block.arguments[1])
-                block.append(divf_op)
-                block.append(linalg.YieldOp([divf_op.result]))
-
-    return op
+    div_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype)
+    op = tosa.MulOp(
+        div_result_tensor_type,
+        input1,
+        tosa.ReciprocalOp(input2.type, input2).result,
+        ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0),
+    )
+    return op.result
 
 
 def softmax_op(
-    node: torch.fx.Node,
+    node: SoftmaxOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor softmax operation.
-    From PyTorch `aten._softmax.default` operator to MLIR linalg `generic`
-    operation.
+    From buddy SoftmaxOp to MLIR linalg `generic` operation.
 
     Note: This op, compute input node's softmax result.
     Args:
@@ -2129,266 +1547,109 @@ def softmax_op(
     dim = int(node.args[1])
     if input1 is None:
         return
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
     if dim < 0:
         dim += len(output_shape)
-    if dtype == "torch.float32":
-        max_tensor_shape = copy.deepcopy(output_shape)
-        max_tensor_shape[dim] = 1
-        max_tensor_type = ir.RankedTensorType.get(
-            max_tensor_shape, ir.F32Type.get()
-        )
-        max_tensor = tensor.EmptyOp(max_tensor_shape, ir.F32Type.get())
-        max_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(max_tensor_shape))
-        ]
-        max_tensor_map = ir.AffineMap.get(
-            len(max_tensor_shape), 0, max_tensor_map
-        )
-        neg_inf_fill = linalg.GenericOp(
-            [max_tensor_type],
-            [],
-            [max_tensor],
-            ir.ArrayAttr.get([ir.AffineMapAttr.get(max_tensor_map)]),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(max_tensor_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            neg_inf_fill.region,
-            [ir.RankedTensorType(max_tensor.result.type).element_type],
-        )
-        neg_inf_op = arith.ConstantOp(
-            ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), float("-inf"))
-        )
-        block.append(neg_inf_op)
-        block.append(linalg.YieldOp([neg_inf_op.result]))
-
-        input1_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
-        max_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        max_tensor_map[dim] = ir.AffineExpr.get_constant(0)
-        max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map)
-        loop_type = [
-            ir.Attribute.parse("#linalg.iterator_type<parallel>")
-        ] * len(output_shape)
-        loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type<reduction>")
-        max_tensor_op = linalg.GenericOp(
-            [max_tensor_type],
-            [input1],
-            [neg_inf_fill],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(input1_map),
-                    ir.AffineMapAttr.get(max_tensor_map),
-                ]
-            ),
-            ir.ArrayAttr.get(loop_type),
-        )
-        block = ir.Block.create_at_start(
-            max_tensor_op.region,
-            [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(neg_inf_fill.result.type).element_type,
-            ],
-        )
-        max_op = arith.MaximumFOp(block.arguments[0], block.arguments[1])
-        block.append(max_op)
-        block.append(linalg.YieldOp([max_op.result]))
-
-        exp_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        exp_tensor_type = ir.RankedTensorType.get(
-            output_shape, ir.F32Type.get()
-        )
-        input1_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
-        max_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        max_tensor_map[dim] = ir.AffineExpr.get_constant(0)
-        max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map)
-        exp_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map)
-        exp_tensor_op = linalg.GenericOp(
-            [exp_tensor_type],
-            [input1, max_tensor_op.result],
-            [exp_tensor],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(input1_map),
-                    ir.AffineMapAttr.get(max_tensor_map),
-                    ir.AffineMapAttr.get(exp_tensor_map),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            exp_tensor_op.region,
-            [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(max_tensor_op.result.type).element_type,
-                ir.RankedTensorType(exp_tensor.result.type).element_type,
-            ],
-        )
-        sub_op = arith.SubFOp(block.arguments[0], block.arguments[1])
-        exp_op = math.ExpOp(sub_op.result)
-        block.append(sub_op)
-        block.append(exp_op)
-        block.append(linalg.YieldOp([exp_op.result]))
-
-        reduce_sum_tensor_shape = copy.deepcopy(output_shape)
-        reduce_sum_tensor_shape[dim] = 1
-        reduce_sum_tensor = tensor.EmptyOp(
-            reduce_sum_tensor_shape, ir.F32Type.get()
-        )
-        reduce_sum_tensor_type = ir.RankedTensorType.get(
-            reduce_sum_tensor_shape, ir.F32Type.get()
-        )
-        reduce_sum_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        reduce_sum_tensor_map = ir.AffineMap.get(
-            len(output_shape), 0, reduce_sum_tensor_map
-        )
-        zero_fill_op = linalg.GenericOp(
-            [reduce_sum_tensor_type],
-            [],
-            [reduce_sum_tensor.result],
-            ir.ArrayAttr.get([ir.AffineMapAttr.get(reduce_sum_tensor_map)]),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            zero_fill_op.region,
-            [ir.RankedTensorType(reduce_sum_tensor.result.type).element_type],
-        )
-        zero_op = arith.ConstantOp(
-            ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0)
-        )
-        block.append(zero_op)
-        block.append(linalg.YieldOp([zero_op.result]))
-
-        reduce_sum_tensor_shape = copy.deepcopy(output_shape)
-        reduce_sum_tensor_shape[dim] = 1
-        reduce_sum_tensor_type = ir.RankedTensorType.get(
-            reduce_sum_tensor_shape, ir.F32Type.get()
-        )
-        exp_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map)
-        reduce_sum_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0)
-        reduce_sum_tensor_map = ir.AffineMap.get(
-            len(output_shape), 0, reduce_sum_tensor_map
-        )
-        loop_type = [
-            ir.Attribute.parse("#linalg.iterator_type<parallel>")
-        ] * len(output_shape)
-        loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type<reduction>")
-        reduce_sum_tensor_op = linalg.GenericOp(
-            [reduce_sum_tensor_type],
-            [exp_tensor_op.result],
-            [zero_fill_op.result],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(exp_tensor_map),
-                    ir.AffineMapAttr.get(reduce_sum_tensor_map),
-                ]
-            ),
-            ir.ArrayAttr.get(loop_type),
-        )
-        block = ir.Block.create_at_start(
-            reduce_sum_tensor_op.region,
+    mlir_dtype = mlir_element_type_get(dtype)
+    # tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    # output = tensor.EmptyOp(output_shape, mlir_dtype)
+    # op = linalg.softmax(
+    #     [tensor_type],
+    #     input1,
+    #     output,
+    #     ir.IntegerAttr.get(ir.IntegerType.get_signless(64), dim),
+    # )
+    # print(op, flush=True)
+    sum_tensor_shape = copy.deepcopy(output_shape)
+    sum_tensor_shape[dim] = 1
+    sum_tensor_type = ir.RankedTensorType.get(sum_tensor_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0)
+    attr = ir.DenseElementsAttr.get_splat(sum_tensor_type, element)
+    sum_tensor = arith.ConstantOp(sum_tensor_type, attr).result
+    input1_map = [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))]
+    input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
+    sum_tensor_map = [
+        ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
+    ]
+    sum_tensor_map[dim] = ir.AffineExpr.get_constant(0)
+    sum_tensor_map = ir.AffineMap.get(len(output_shape), 0, sum_tensor_map)
+    loop_type = [ir.Attribute.parse("#linalg.iterator_type<parallel>")] * len(
+        output_shape
+    )
+    loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type<reduction>")
+    sum_tensor_op = linalg.GenericOp(
+        [sum_tensor_type],
+        [input1],
+        [sum_tensor],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(exp_tensor_op.result.type).element_type,
-                ir.RankedTensorType(zero_fill_op.result.type).element_type,
-            ],
-        )
-        add_op = arith.AddFOp(block.arguments[0], block.arguments[1])
-        block.append(add_op)
-        block.append(linalg.YieldOp([add_op.result]))
-
-        reduce_sum_tensor_shape = copy.deepcopy(output_shape)
-        reduce_sum_tensor_shape[dim] = 1
-        result_tensor_type = ir.RankedTensorType.get(
-            output_shape, ir.F32Type.get()
-        )
-        result_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        exp_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map)
-        reduce_sum_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0)
-        reduce_sum_tensor_map = ir.AffineMap.get(
-            len(output_shape), 0, reduce_sum_tensor_map
-        )
-        result_tensor_map = [
-            ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
-        ]
-        result_tensor_map = ir.AffineMap.get(
-            len(output_shape), 0, result_tensor_map
-        )
-        op = linalg.GenericOp(
-            [result_tensor_type],
-            [exp_tensor_op.result, reduce_sum_tensor_op.result],
-            [result_tensor.result],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(exp_tensor_map),
-                    ir.AffineMapAttr.get(reduce_sum_tensor_map),
-                    ir.AffineMapAttr.get(result_tensor_map),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
+                ir.AffineMapAttr.get(input1_map),
+                ir.AffineMapAttr.get(sum_tensor_map),
+            ]
+        ),
+        ir.ArrayAttr.get(loop_type),
+    )
+    block = ir.Block.create_at_start(
+        sum_tensor_op.region,
+        [
+            mlir_dtype,
+            mlir_dtype,
+        ],
+    )
+    exp_op = math.ExpOp(block.arguments[0])
+    add_op = arith.AddFOp(exp_op.result, block.arguments[1])
+    block.append(exp_op)
+    block.append(add_op)
+    block.append(linalg.YieldOp([add_op.result]))
+    result_tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    result_tensor = tensor.EmptyOp(output_shape, mlir_dtype)
+    result_tensor_map = [
+        ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
+    ]
+    result_tensor_map = ir.AffineMap.get(
+        len(output_shape), 0, result_tensor_map
+    )
+    op = linalg.GenericOp(
+        [result_tensor_type],
+        [input1, sum_tensor_op.result],
+        [result_tensor.result],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(exp_tensor_op.result.type).element_type,
-                ir.RankedTensorType(
-                    reduce_sum_tensor_op.result.type
-                ).element_type,
-                ir.RankedTensorType(result_tensor.result.type).element_type,
-            ],
-        )
-        div_op = arith.DivFOp(block.arguments[0], block.arguments[1])
-        block.append(div_op)
-        block.append(linalg.YieldOp([div_op.result]))
+                ir.AffineMapAttr.get(input1_map),
+                ir.AffineMapAttr.get(sum_tensor_map),
+                ir.AffineMapAttr.get(result_tensor_map),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+            * len(output_shape)
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            mlir_dtype,
+            mlir_dtype,
+            mlir_dtype,
+        ],
+    )
+    exp_op = math.ExpOp(block.arguments[0])
+    div_op = arith.DivFOp(exp_op.result, block.arguments[1])
+    block.append(exp_op)
+    block.append(div_op)
+    block.append(linalg.YieldOp([div_op.result]))
 
     return op
 
 
 def clone_op(
-    node: torch.fx.Node,
+    node: CloneOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor clone operation.
-    From PyTorch `aten.clone.default` operator to MLIR tensor `extract_slice`
+    From buddy CloneOp to MLIR tensor `extract_slice`
     operation.
 
     Note: This op, clone input tensor to a new tensor.
@@ -2405,31 +1666,29 @@ def clone_op(
     if input1 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        offset = [0 for x in output_shape]
-        offset_attr = ir._denseI64ArrayAttr(offset, None)
-        size_attr = ir._denseI64ArrayAttr(output_shape, None)
-        stride = [1 for x in output_shape]
-        stride_attr = ir._denseI64ArrayAttr(stride, None)
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-
-        op = tensor.ExtractSliceOp(
-            tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr
-        )
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    offset = [0 for x in output_shape]
+    offset_attr = ir._denseI64ArrayAttr(offset, None)
+    size_attr = ir._denseI64ArrayAttr(output_shape, None)
+    stride = [1 for x in output_shape]
+    stride_attr = ir._denseI64ArrayAttr(stride, None)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    op = tensor.ExtractSliceOp(
+        tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr
+    )
 
     return op
 
 
 def silu_op(
-    node: torch.fx.Node,
+    node: SiluOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor silu activation operation.
-    From PyTorch `aten.silu.default` operator to MLIR linalg `generic`
-    operation.
+    From Buddy SiluOp to MLIR linalg `generic` operation.
 
     Note: This op, compute input node's silu activation result.
     Args:
@@ -2445,63 +1704,61 @@ def silu_op(
     if input1 is None:
         return
 
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-        generic_map = ir.AffineMap.get_permutation(
-            [i for i in range(len(output_shape))]
-        )
-        op = linalg.GenericOp(
-            [tensor_type],
-            [input1],
-            [output],
-            ir.ArrayAttr.get(
-                [
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                    ir.AffineMapAttr.get(
-                        generic_map.get_submap(
-                            [i for i in range(len(output_shape))]
-                        )
-                    ),
-                ]
-            ),
-            ir.ArrayAttr.get(
-                [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                * len(output_shape)
-            ),
-        )
-        block = ir.Block.create_at_start(
-            op.region,
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    generic_map = ir.AffineMap.get_permutation(
+        [i for i in range(len(output_shape))]
+    )
+    op = linalg.GenericOp(
+        [tensor_type],
+        [input1],
+        [output],
+        ir.ArrayAttr.get(
             [
-                ir.RankedTensorType(input1.type).element_type,
-                ir.RankedTensorType(output.result.type).element_type,
-            ],
-        )
-        neg_op = arith.NegFOp(block.arguments[0])
-        exp_op = math.ExpOp(neg_op.result)
-        one_op = arith.ConstantOp(
-            ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 1)
-        )
-        add_op = arith.AddFOp(one_op.result, exp_op.result)
-        div_op = arith.DivFOp(block.arguments[0], add_op.result)
-        block.append(neg_op)
-        block.append(exp_op)
-        block.append(one_op)
-        block.append(add_op)
-        block.append(div_op)
-        block.append(linalg.YieldOp([div_op.result]))
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+            * len(output_shape)
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            ir.RankedTensorType(input1.type).element_type,
+            ir.RankedTensorType(output.result.type).element_type,
+        ],
+    )
+    neg_op = arith.NegFOp(block.arguments[0])
+    exp_op = math.ExpOp(neg_op.result)
+    one_op = arith.ConstantOp(mlir_dtype, mlir_element_attr_get(dtype, 1))
+    add_op = arith.AddFOp(one_op.result, exp_op.result)
+    div_op = arith.DivFOp(block.arguments[0], add_op.result)
+    block.append(neg_op)
+    block.append(exp_op)
+    block.append(one_op)
+    block.append(add_op)
+    block.append(div_op)
+    block.append(linalg.YieldOp([div_op.result]))
 
     return op
 
 
 def param_extract(
-    node: torch.fx.Node,
+    node: PlaceholderOp,
     offset,
     params_mlir_node,
 ):
@@ -2519,12 +1776,12 @@ def param_extract(
         op: The operation return the tensor.expand_shape op.
     """
     dtype_mapping = {
-        torch.float32: ir.F32Type.get(),
-        torch.int64: ir.IntegerType.get_signless(64),
+        TensorDType.Float32: ir.F32Type.get(),
+        TensorDType.Int64: ir.IntegerType.get_signless(64),
     }
-    tensor_element_type = dtype_mapping[node.meta["tensor_meta"].dtype]
-    output_shape = list(node.meta["tensor_meta"].shape)
-    extract_size = functools.reduce(lambda x, y: x * y, output_shape)
+    tensor_element_type = dtype_mapping[node.tensor_meta["dtype"]]
+    output_shape = list(node.tensor_meta["shape"])
+    extract_size = functools.reduce(lambda x, y: x * y, output_shape, 1)
     offset_attr = ir._denseI64ArrayAttr([offset], None)
     size_attr = ir._denseI64ArrayAttr([extract_size], None)
     stride = [1]
@@ -2540,7 +1797,7 @@ def param_extract(
         size_attr,
         stride_attr,
     )
-    if len(output_shape) == 1:
+    if len(output_shape) == 1 or len(output_shape) == 0:
         return extract_slice_op
     tensor_type = ir.RankedTensorType.get(output_shape, tensor_element_type)
     axis = ir.ArrayAttr.get(
@@ -2553,36 +1810,123 @@ def param_extract(
     axis = ir.ArrayAttr.get([axis], None)
     return tensor.ExpandShapeOp(tensor_type, extract_slice_op.result, axis)
 
+def where_op(
+    node: WhereOp,
+    symbol_table: Dict[Tuple[str, int], ir.Operation],
+):
+    """
+    Import the tensor where operation.
+    From Buddy WhereOp to MLIR linalg `generic` operation.
+
+    Note: Per element, yields args[1] where args[0] holds, else args[2].
+    Args:
+        node: Graph node whose three args name the condition, the value
+        taken when it holds (used un-indexed), and the fallback tensor.
+        symbol_table: Maps (name, index) keys to the emitted operations.
+
+    Returns:
+        op: The linalg.generic op, or None if any input is unresolved.
+    """
+    assert len(node.args) == 3
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    input2 = symbol_table.get((str(node.args[1]), 0))
+    input3 = symbol_table.get((str(node.args[2]), 0))
+    if input1 is None or input2 is None or input3 is None:
+        return
+
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    generic_map = ir.AffineMap.get_permutation(
+        [i for i in range(len(output_shape))]
+    )
+    op = linalg.GenericOp(
+        [tensor_type],
+        [input1, input3],
+        [output],
+        ir.ArrayAttr.get(
+            [
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+                ir.AffineMapAttr.get(
+                    generic_map.get_submap(
+                        [i for i in range(len(output_shape))]
+                    )
+                ),
+            ]
+        ),
+        ir.ArrayAttr.get(
+            [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
+            * len(output_shape)
+        ),
+    )
+    block = ir.Block.create_at_start(
+        op.region,
+        [
+            ir.RankedTensorType(input1.type).element_type,
+            ir.RankedTensorType(input3.type).element_type,
+            ir.RankedTensorType(output.result.type).element_type,
+        ],
+    )
+    select_op = arith.SelectOp(block.arguments[0], input2, block.arguments[1])
+    block.append(select_op)
+    block.append(linalg.YieldOp([select_op.result]))
+
+    return op
+
+def scalar_tensor_op(node: ScalarTensorOp, symbol_table):
+    """
+    Import the scalar tensor operation.
+    From Buddy ScalarTensorOp to MLIR arith `ConstantOp` operation; the
+    constant value is node.args[0], typed by node.tensor_meta["dtype"].
+    """
+    assert len(node.args) == 1
+    dtype = node.tensor_meta["dtype"]
+    attr = mlir_element_attr_get(dtype, node.args[0])
+    # Pass the MLIR element type (as silu_op does), not the TensorDType enum.
+    return arith.ConstantOp(mlir_element_type_get(dtype), attr)
 
 ops_registry = {
-    "arange.start": arange_op,
-    "arange.default": arange_op,
-    "unsqueeze.default": unsqueeze_op,
-    "view.default": view_op,
-    "ones.default": ones_op,
-    "full.default": full_op,
-    "lt.Tensor": lt_op,
-    "embedding.default": embedding_op,
-    "masked_fill.Scalar": masked_fill_op,
-    "slice.Tensor": slice_op,
-    "expand.default": expand_op,
-    "_to_copy.default": to_copy_op,
-    "rsub.Scalar": rsub_op,
-    "pow.Tensor_Scalar": pow_op,
-    "mean.dim": mean_op,
-    "rsqrt.default": rsqrt_op,
-    "mul.Tensor": mul_op,
-    "t.default": t_op,
-    "mm.default": matmul_op,
-    "transpose.int": transpose_op,
-    "index.Tensor": index_op,
-    "neg.default": neg_op,
-    "cat.default": cat_op,
-    "squeeze.dim": squeeze_op,
-    "bmm.default": batch_matmul_op,
-    "div.Tensor": div_op,
-    "_softmax.default": softmax_op,
-    "clone.default": clone_op,
-    "silu.default": silu_op,
     "param.extract": param_extract,
+    "MatmulOp": matmul_op,
+    "ArangeOp": arange_op,
+    "UnsqueezeOp": unsqueeze_op,
+    "ViewOp": view_op,
+    "EmbeddingOp": embedding_op,
+    "OnesOp": ones_op,
+    "FullOp": full_op,
+    "LessThanOp": lt_op,
+    "MaskedFillOp": masked_fill_op,
+    "SliceOp": slice_op,
+    "ExpandOp": expand_op,
+    "ToCopyOp": to_copy_op,
+    "RsubOp": rsub_op,
+    "PowOp": pow_op,
+    "MeanOp": mean_op,
+    "RsqrtOp": rsqrt_op,
+    "MulOp": mul_op,
+    "TOp": t_op,
+    "TransposeOp": transpose_op,
+    "IndexOp": index_op,
+    "NegOp": neg_op,
+    "CatOp": cat_op,
+    "SqueezeOp": squeeze_op,
+    "BatchMatmulOp": batch_matmul_op,
+    "DivOp": div_op,
+    "SoftmaxOp": softmax_op,
+    "CloneOp": clone_op,
+    "SiluOp": silu_op,
+    "AddOp": add_op,
+    "WhereOp": where_op,
+    "ScalarTensorOp": scalar_tensor_op,
 }
diff --git a/frontend/Python/ops/math.py b/frontend/Python/ops/math.py
index 7e2de80b5..19820c2b3 100644
--- a/frontend/Python/ops/math.py
+++ b/frontend/Python/ops/math.py
@@ -22,11 +22,16 @@
 
 
 def erf_op(node, symbol_table):
-    input_ = symbol_table.get((str(node.args[0]), 0))
-    op = math.ErfOp(input_)
+    input_tensor = symbol_table.get((str(node.args[0]), 0))
+    op = math.ErfOp(input_tensor)
     return op
 
+def sqrt_op(node, symbol_table):
+    """Build an MLIR math `SqrtOp` from the node's first argument."""
+    return math.SqrtOp(symbol_table.get((str(node.args[0]), 0)))
+
 
 ops_registry = {
-    "erf.default": erf_op,
+    "ErfOp": erf_op,
+    "SqrtOp": sqrt_op,
 }
diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py
index bf957002a..8a0997a3a 100644
--- a/frontend/Python/ops/tosa.py
+++ b/frontend/Python/ops/tosa.py
@@ -14,17 +14,52 @@
 #
 # ===---------------------------------------------------------------------------
 #
-# The registry of mappings from Torch node to MLIR tosa dialect operations.
+# The registry of mappings from Buddy Graph to MLIR tosa dialect operations.
 #
 # ===---------------------------------------------------------------------------
 
-import torch
 import array
 from typing import Dict, List, Tuple, Union
+import numpy
 
 import mlir.ir as ir
 from mlir.dialects import tensor, tosa
 
+from ..graph import TensorDType
+from ..graph import (
+    AddOp,
+    PermuteOp,
+    AddMMOp,
+    BatchMatmulOp,
+    SubOp,
+    MulOp,
+    DivOp,
+    TanhOp,
+    ExpOp,
+    RsqrtOp,
+    AmaxOp,
+    ReshapeOp,
+    UnsqueezeOp,
+    SelectOp,
+    SliceOp,
+    ConvertElementTypeOp,
+    CloneOp,
+    VarMeanOp,
+    EmbeddingOp,
+    ExpandOp,
+    SumDimOp,
+    TOp,
+    TransposeOp,
+    MaxPool2dOp,
+    Conv2dOp,
+    ReluOp,
+    IotaOp,
+    SigmoidOp,
+    ReciprocalOp,
+    MeanOp,
+)
+from .utils import *
+
 
 def _normalize_binary_operator_shape(shp1, shp2):
     """Normalize the shape of two input tensors according to the broadcasting
@@ -75,9 +110,8 @@ def _gen_arith_binary_op(input1, input2, op_func):
 def _scalar_to_tensor(
     scalar: Union[float, int], element_type: ir.Type, shape: List[int]
 ):
-    """PyTorch allow the binary operation between tensor and scalar. But MLIR
-    does not.
-    So we need to convert scalars to the corresponding tensors."""
+    """Convert scalars to corresponding tensors since MLIR
+    doesn't support operations between scalars and tensors."""
     element = (
         ir.FloatAttr.get(element_type, float(scalar))
         if str(element_type) == "f32"
@@ -128,11 +162,11 @@ def _normalize_binary_operator_args(arg1, arg2):
 
 
 def addmm_op(
-    node, symbol_table: Dict[Tuple[str, int], ir.Operation]
+    node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation]
 ) -> ir.Operation:
     """
     Import matrix multiplication operation.
-    From PyTorch `aten.addmm.default` operator to MLIR TOSA `matmul` operation.
+    From buddy graph ir's `AddMMOp` operator to MLIR TOSA `matmul` operation.
 
     Note: this function first reshapes the input matrices to 3D tensors
     (since tosa.MatMulOp requires it). Then it multiplies these reshaped
@@ -146,8 +180,7 @@ def addmm_op(
 
     Returns:
         op: The operation representing the result of adding the matrix
-        multiplication
-            to the input tensor.
+        multiplication to the input tensor.
     """
     # get input
     input_ = symbol_table.get((str(node.args[0]), 0))
@@ -184,10 +217,11 @@ def addmm_op(
     return op
 
 
-def bmm_op(node, symbol_table) -> ir.Operation:
+def bmm_op(node: BatchMatmulOp, symbol_table) -> ir.Operation:
     """
     Import batch matrix multiplication operation.
-    From PyTorch `aten.bmm.default` operator to MLIR TOSA `matmul` operation.
+    From buddy graph ir's `BatchMatmulOp` operator to MLIR TOSA `matmul` 
+    operation.
     """
     input_ = symbol_table.get((str(node.args[0]), 0))
     mat2 = symbol_table.get((str(node.args[1]), 0))
@@ -200,30 +234,30 @@ def bmm_op(node, symbol_table) -> ir.Operation:
     return op
 
 
-def add_op(node, symbol_table):
+def add_op(node: AddOp, symbol_table):
     """
     Import tensor addition operation.
-    From PyTorch `aten.add.Tensor` operator to MLIR TOSA `add` operation.
+    From buddy graph ir's `AddOp` operator to MLIR TOSA `add` operation.
     """
     input1 = symbol_table.get((str(node.args[0]), 0), node.args[0])
     input2 = symbol_table.get((str(node.args[1]), 0), node.args[1])
     return _gen_arith_binary_op(input1, input2, tosa.AddOp)
 
 
-def sub_op(node, symbol_table):
+def sub_op(node: SubOp, symbol_table):
     """
     Import tensor subtraction operation.
-    From PyTorch `aten.sub.Tensor` operator to MLIR TOSA `sub` operation.
+    From buddy graph ir's `SubOp` operator to MLIR TOSA `sub` operation.
     """
     input1 = symbol_table.get((str(node.args[0]), 0), node.args[0])
     input2 = symbol_table.get((str(node.args[1]), 0), node.args[1])
     return _gen_arith_binary_op(input1, input2, tosa.SubOp)
 
 
-def mul_op(node, symbol_table):
+def mul_op(node: MulOp, symbol_table):
     """
-    Import tensor multiplication operation.
-    From PyTorch `aten.mul.Tensor` operator to MLIR TOSA `mul` operation.
+    Import tensor multiplication operation.
+    From buddy graph ir's `MulOp` operator to MLIR TOSA `mul` operation.
     """
 
     def _inner_op(result_type, input1, input2):
@@ -240,10 +274,10 @@ def _inner_op(result_type, input1, input2):
     return _gen_arith_binary_op(input1, input2, _inner_op)
 
 
-def div_op(node, symbol_table):
+def div_op(node: DivOp, symbol_table):
     """
     Import tensor division operation.
-    From PyTorch `aten.div.Tensor` operator to MLIR TOSA `div` operation.
+    From buddy graph ir's `DivOp` operator to MLIR TOSA `div` operation.
     """
 
     def _inner_op(result_type, input1, input2):
@@ -260,10 +294,10 @@ def _inner_op(result_type, input1, input2):
     return _gen_arith_binary_op(input1, input2, _inner_op)
 
 
-def tanh_op(node, symbol_table):
+def tanh_op(node: TanhOp, symbol_table):
     """
     Import elementwise tanh operation.
-    From PyTorch `aten.tanh.default` operator to MLIR TOSA `tanh` operation.
+    From buddy graph ir's `TanhOp` operator to MLIR TOSA `tanh` operation.
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     sizes = ir.RankedTensorType(input1.type).shape
@@ -273,10 +307,10 @@ def tanh_op(node, symbol_table):
     return op
 
 
-def exp_op(node, symbol_table):
+def exp_op(node: ExpOp, symbol_table):
     """
     Import elementwise exponential operation.
-    From PyTorch `aten.exp.default` operator to MLIR TOSA `exp` operation.
+    From buddy graph ir's `ExpOp` operator to MLIR TOSA `exp` operation.
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     sizes = ir.RankedTensorType(input1.type).shape
@@ -286,10 +320,10 @@ def exp_op(node, symbol_table):
     return op
 
 
-def rsqrt_op(node, symbol_table):
+def rsqrt_op(node: RsqrtOp, symbol_table):
     """
     Import elementwise reciprocal square root operation.
-    From PyTorch `aten.rsqrt.default` operator to MLIR TOSA `rsqrt` operation.
+    From buddy graph ir's `RsqrtOp` operator to MLIR TOSA `rsqrt` operation.
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     sizes = ir.RankedTensorType(input1.type).shape
@@ -301,15 +335,11 @@ def rsqrt_op(node, symbol_table):
     return op
 
 
-def amax_op(node, symbol_table):
+def amax_op(node: AmaxOp, symbol_table):
     """
     Import the amax operation.
-    From PyTorch `aten.amax.default` operator to MLIR TOSA `reduce_max`
+    From buddy graph ir's `AmaxOp` operator to MLIR TOSA `reduce_max`
     operation.
-
-    Note: This conversion function returns the maximum value of each slice
-          of the input tensor in the given dimension(s). This is consistent
-          with PyTorch's `torch.amax` operator.
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     dim_val = node.args[1][0]
@@ -321,10 +351,10 @@ def amax_op(node, symbol_table):
     return op
 
 
-def reshape_op(node, symbol_table):
+def reshape_op(node: ReshapeOp, symbol_table):
     """
     Import the reshape operation.
-    From PyTorch `aten.reshape.default` operator to MLIR TOSA `reshape`
+    From buddy graph ir's `ReshapeOp` operator to MLIR TOSA `reshape`
     operation.
 
     Note: If the new shape contains one and only one `-1`, the size of the new
@@ -362,34 +392,30 @@ def reshape_op(node, symbol_table):
     return op
 
 
-def unsqueeze_op(node, symbol_table):
+def unsqueeze_op(node: UnsqueezeOp, symbol_table):
     """
     Import the unsqueeze operation.
-    From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape`
+    From buddy graph ir's `UnsqueezeOp` operator to MLIR TOSA `reshape`
     operation.
-
-    Note: "unsqueeze" means inserting a new dimension of size 1 at the specified
-          position. For more information, please refer to
-          https://pytorch.org/docs/stable/generated/torch.unsqueeze.html
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
     dim = node.args[1]
     sizes = ir.RankedTensorType(input_tensor.type).shape
-    sizes.insert(dim, 1)
+    if dim < 0:
+        # Wrap any negative dim; -1 must append, which insert(-1) wouldn't.
+        dim += len(sizes) + 1
+    sizes.insert(dim, 1)
     new_shape_content = array.array("i", sizes)
     new_shape_content = memoryview(new_shape_content)
     op = tosa.ReshapeOp(input_tensor, new_shape_content)
     return op
 
 
-def select_op(node, symbol_table):
+def select_op(node: SelectOp, symbol_table):
     """
     Import the select operation.
-    From PyTorch `aten.select.int` operator to MLIR TOSA `reshape` operation.
-
-    Note: "select" means slicing the input tensor along the selected dimension
-    at the given index. For more information, please refer to
-          https://pytorch.org/docs/stable/generated/torch.select.html
+    From buddy graph ir's `SelectOp` operator to MLIR TOSA `reshape`
+    operation.
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
     dim = node.args[1]
@@ -416,14 +442,11 @@ def select_op(node, symbol_table):
     return op
 
 
-def slice_op(node, symbol_table):
+def slice_op(node: SliceOp, symbol_table):
     """
     Import the slice operation.
-    From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice`
+    From buddy graph ir's `SliceOp` operator to MLIR tensor `extract_slice`
     operation.
-
-    Note: "slice" means slicing the input tensor along the selected dimension
-    from a given start index to an end index.
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
     dim = node.args[1]
@@ -477,17 +500,19 @@ def slice_op(node, symbol_table):
     return op
 
 
-def convert_element_type_op(node, symbol_table):
+def convert_element_type_op(node: ConvertElementTypeOp, symbol_table):
     """
     Import the element type conversion operation.
-    From PyTorch `prims.convert_element_type.default` operator to
-    MLIR TOSA `cast` operation.
+    From buddy graph ir's `ConvertElementTypeOp` operator to MLIR TOSA
+    `cast` operation.
     """
-    # maintain a mapping of torch types and mlir types
+    # maintain a mapping of buddy dtype to mlir types
     types_mapping = {
-        torch.float64: ir.F64Type.get(),
-        torch.float32: ir.F32Type.get(),
-        torch.float16: ir.F16Type.get(),
+        TensorDType.Float64: ir.F64Type.get(),
+        TensorDType.Float32: ir.F32Type.get(),
+        TensorDType.Float16: ir.F16Type.get(),
+        TensorDType.Int32: ir.IntegerType.get_signless(32),
+        TensorDType.Bool: ir.IntegerType.get_signless(1),
     }
     input_tensor = symbol_table.get((str(node.args[0]), 0))
     to_cast_type = types_mapping[node.args[1]]
@@ -496,13 +521,13 @@ def convert_element_type_op(node, symbol_table):
     return tosa.CastOp(output_type, input_tensor)
 
 
-def clone_op(node, symbol_table):
+def clone_op(node: CloneOp, symbol_table):
     """
     Import the clone operation.
-    From PyTorch `aten.clone.default` operator to MLIR TOSA `identity`
+    From buddy graph ir's `CloneOp` operator to MLIR TOSA `identity`
     operation.
 
-    Note: Since MLIR follow the SSA form, when using the `identity` operation,
+    Note: Since MLIR follows the SSA form, when using the `identity` operation,
     we actually deep-copies the original tensor.
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
@@ -513,13 +538,16 @@ def clone_op(node, symbol_table):
     return tosa.IdentityOp(output_type, input_tensor)
 
 
-def var_mean_op(node, symbol_table):
+def var_mean_op(node: VarMeanOp, symbol_table):
     """
     Import the variance & mean operation.
-    From PyTorch `aten.var_mean.default` operator to two MLIR TOSA `mul`
+    From buddy graph ir's `VarMeanOp` operator to two MLIR TOSA `mul`
     operation.
 
-    Note: The conversion procedure can be splited into two steps:
+    Note: For now, this conversion function follows PyTorch's `var_mean`
+    semantics.
+
+          The conversion procedure can be split into two steps:
           1. In the first part, we calculate the mean value along the given
           dimension(s) in `mean_dim_op` function. We first reduce the input
           tensor along the given dimension(s) using tosa's `reduce_sum`
@@ -667,10 +695,10 @@ def var_dim_op(
     return var_op, mean_op
 
 
-def permute_op(node, symbol_table):
+def permute_op(node: PermuteOp, symbol_table):
     """
     Import the permute operation.
-    From PyTorch `aten.permute.default` operator to MLIR TOSA `transpose`
+    From buddy graph ir's `PermuteOp` operator to MLIR TOSA `transpose`
     operation.
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
@@ -693,10 +721,10 @@ def permute_op(node, symbol_table):
     return permute_op
 
 
-def embedding_op(node, symbol_table):
+def embedding_op(node: EmbeddingOp, symbol_table):
     """
     Import the embedding operation.
-    From PyTorch `aten.embedding.default` operator to MLIR TOSA `reshape`
+    From buddy graph ir's `EmbeddingOp` operator to MLIR TOSA `reshape`
     operation.
 
     Note: Althought this conversion function will finally return a `reshape`
@@ -754,10 +782,10 @@ def embedding_op(node, symbol_table):
     return op
 
 
-def expand_op(node, symbol_table) -> ir.Operation:
+def expand_op(node: ExpandOp, symbol_table) -> ir.Operation:
     """
     Import the expand operation.
-    From PyTorch `aten.expand.default` operator to MLIR TOSA `add` operation.
+    From buddy graph ir's `ExpandOp` operator to MLIR TOSA `add` operation.
 
     Note: This conversion is implemented using the broadcast machanism of TOSA
           `add` operation. We allocate a tensor with the shape to expand and
@@ -787,11 +815,10 @@ def expand_op(node, symbol_table) -> ir.Operation:
     return op
 
 
-def sum_op(node, symbol_table):
+def sum_op(node: SumDimOp, symbol_table):
     """
     Import the sum operation.
-    From PyTorch `aten.sum.dim_IntList` operator to MLIR TOSA `reduce_sum`
-    operation.
+    From buddy graph ir's `SumDimOp` operator to MLIR TOSA `reduce_sum` op.
     """
     input_tensor = symbol_table.get((str(node.args[0]), 0))
     reduce_sum_dims = node.args[1]
@@ -813,40 +840,37 @@ def sum_op(node, symbol_table):
     return reduce_sum_op
 
 
-def t_op(node, symbol_table):
+def t_op(node: TOp, symbol_table):
     """
     Import the tensor transpose operation.
-    From PyTorch `aten.t.default` operator to MLIR TOSA `reduce_sum` operation.
+    From buddy graph ir's `TOp` operator to MLIR TOSA `transpose` operation.
     """
     assert len(node.args) == 1
     input1 = symbol_table.get((str(node.args[0]), 0))
-    if input1 is None:
-        return
+    assert input1 is not None
 
     input_shape = list(ir.RankedTensorType(input1.type).shape)
-    output_shape = list(node.meta["tensor_meta"].shape)
-    if len(input_shape) == 2:
-        perm_const_op = tosa.ConstOp(
-            ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0])))
-        )
-        result_element_type = ir.RankedTensorType(input1.type).element_type
-        permute_result_type = ir.RankedTensorType.get(
-            output_shape, result_element_type
-        )
-        op = tosa.TransposeOp(
-            permute_result_type, input1, perm_const_op.results[0]
-        )
+    output_shape = list(node.tensor_meta["shape"])
+    assert len(input_shape) == 2, "Input tensor must be 2D"
+    perm_const_op = tosa.ConstOp(
+        ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0])))
+    )
+    result_element_type = ir.RankedTensorType(input1.type).element_type
+    permute_result_type = ir.RankedTensorType.get(
+        output_shape, result_element_type
+    )
+    op = tosa.TransposeOp(permute_result_type, input1, perm_const_op.results[0])
 
     return op
 
 
-def transpose_op(node, symbol_table):
+def transpose_op(node: TransposeOp, symbol_table):
     """
     Import the tensor permute operation based on input dims.
-    From PyTorch `aten.transpose.int` operator to MLIR TOSA `reduce_sum`
+    From buddy graph ir's `TransposeOp` operator to MLIR TOSA `transpose`
     operation.
     """
-    assert len(node.args) == 3
+    assert len(node.args) == 3, "Input tensor must be 3D"
     input1 = symbol_table.get((str(node.args[0]), 0))
     if input1 is None:
         return
@@ -857,7 +881,7 @@ def transpose_op(node, symbol_table):
     temp = perm_list[dim1]
     perm_list[dim1] = perm_list[dim2]
     perm_list[dim2] = temp
-    output_shape = list(node.meta["tensor_meta"].shape)
+    output_shape = list(node.tensor_meta["shape"])
     perm_const_op = tosa.ConstOp(
         ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list)))
     )
@@ -870,29 +894,352 @@ def transpose_op(node, symbol_table):
     return op
 
 
+def maxpool2d_op(node: MaxPool2dOp, symbol_table):
+    """
+    Import the maxpool2d operation.
+    From Buddy MaxPool2dOp to MLIR TOSA `max_pool2d` operation.
+    `tosa.max_pool2d` is built on an NHWC tensor here, so when the node
+    layout contains "NCHW" the input is transposed to NHWC beforehand and
+    the pooled result is transposed back to NCHW afterwards.
+
+    Args:
+        node: Buddy node with args (input, kernel, stride[, padding]).
+        symbol_table: Maps (name, index) keys to emitted MLIR values.
+
+    Returns:
+        The final op (`max_pool2d`, or the trailing `transpose` for NCHW);
+        None when the input is missing from the symbol table.
+
+    Raises:
+        NotImplementedError: If exactly five arguments are supplied.
+    """
+    if len(node.args) == 5:
+        raise NotImplementedError
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    if input1 is None:
+        return
+    kernel = node.args[1]
+    stride = node.args[2]
+    pad = node.args[3] if len(node.args) > 3 else [0 for _ in kernel]
+    # Expand the short padding forms to the four entries passed to TOSA.
+    if len(pad) == 1:
+        pad = [pad[0]] * 4
+    elif len(pad) == 2:
+        pad = [pad[0]] * 2 + [pad[1]] * 2
+    result_element_type = mlir_element_type_get(node.tensor_meta["dtype"])
+    is_nchw = "NCHW" in node._layout
+    out_shape = node.tensor_meta["shape"]
+    if is_nchw:
+        # NCHW -> NHWC for the operand and for the expected result shape.
+        nhwc = [0, 2, 3, 1]
+        in_shape = list(ir.RankedTensorType(input1.type).shape)
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", nhwc)))
+        )
+        permute_result_type = ir.RankedTensorType.get(
+            [in_shape[i] for i in nhwc], result_element_type
+        )
+        input1 = tosa.TransposeOp(
+            permute_result_type, input1, perm_const_op.results[0]
+        ).result
+        out_shape = [out_shape[i] for i in nhwc]
+    kernel_attr = ir._denseI64ArrayAttr(kernel, None)
+    stride_attr = ir._denseI64ArrayAttr(stride, None)
+    pad_attr = ir._denseI64ArrayAttr(pad, None)
+    output = ir.RankedTensorType.get(out_shape, result_element_type)
+    op = tosa.MaxPool2dOp(output, input1, kernel_attr, stride_attr, pad_attr)
+    if is_nchw:
+        # Restore the caller-visible NCHW layout.
+        nchw = [0, 3, 1, 2]
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", nchw)))
+        )
+        permute_result_type = ir.RankedTensorType.get(
+            [out_shape[i] for i in nchw], result_element_type
+        )
+        op = tosa.TransposeOp(
+            permute_result_type, op.result, perm_const_op.results[0]
+        )
+    return op
+
+
+def convolution2d_op(node: Conv2dOp, symbol_table):
+    """
+    Import the convolution operation.
+    From Buddy Conv2dOp to MLIR TOSA `conv2d` operation, or to
+    `transpose_conv2d` when the node marks the kernel as transposed.
+
+    The TOSA ops are built on NHWC data here: an "NCHW" layout transposes
+    the input before and the result after the convolution, and an "FCHW"
+    layout transposes the weight with the same [0, 2, 3, 1] permutation.
+
+    Args:
+        node: Buddy node with nine args: input, weight, bias, stride,
+            padding, dilation, transposed flag, output padding, groups.
+        symbol_table: Maps (name, index) keys to emitted MLIR values.
+
+    Returns:
+        The op producing the result (the trailing `transpose` for NCHW).
+
+    Raises:
+        NotImplementedError: For a transposed kernel whose input padding
+            sums above zero or whose dilation sums above its length.
+    """
+    assert len(node.args) == 9
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    weight = symbol_table.get((str(node.args[1]), 0))
+    # Fail before `.type` is dereferenced on a missing operand below.
+    assert input1 is not None and weight is not None
+    is_kernel_transposed = node.args[6]
+    dtype = node.tensor_meta["dtype"]
+    result_element_type = mlir_element_type_get(dtype)
+    is_nchw = "NCHW" in node._layout
+    nhwc = [0, 2, 3, 1]
+    if is_nchw:
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", nhwc)))
+        )
+        in_shape = list(ir.RankedTensorType(input1.type).shape)
+        permute_result_type = ir.RankedTensorType.get(
+            [in_shape[i] for i in nhwc], result_element_type
+        )
+        input1 = tosa.TransposeOp(
+            permute_result_type, input1, perm_const_op.results[0]
+        ).result
+    if "FCHW" in node._layout:
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", nhwc)))
+        )
+        weight_shape = list(ir.RankedTensorType(weight.type).shape)
+        permute_result_type = ir.RankedTensorType.get(
+            [weight_shape[i] for i in nhwc], result_element_type
+        )
+        weight = tosa.TransposeOp(
+            permute_result_type, weight, perm_const_op.results[0]
+        ).result
+    # NOTE(review): the index is taken from the weight shape *after* the
+    # optional FCHW transpose above -- confirm index 1 is still the output
+    # channel dimension for transposed kernels.
+    out_channels = list(ir.RankedTensorType(weight.type).shape)[
+        1 if is_kernel_transposed else 0
+    ]
+    if len(node._parents) == 2:
+        # Two parents (presumably input and weight only): use a zero bias.
+        new_size_tensor_type = ir.RankedTensorType.get(
+            [out_channels], result_element_type
+        )
+        element = mlir_element_attr_get(dtype, 0)
+        new_size_attr = ir.DenseElementsAttr.get_splat(
+            new_size_tensor_type, element
+        )
+        bias_tensor = tosa.ConstOp(new_size_attr).results[0]
+    else:
+        bias_tensor = symbol_table.get((str(node.args[2]), 0))
+    assert bias_tensor is not None
+    stride = node.args[3]
+    input_padding = node.args[4]
+    if len(input_padding) == 1:
+        input_padding = [input_padding[0]] * 4
+    elif len(input_padding) == 2:
+        input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2
+    dilation = node.args[5]
+    groups = node.args[8]
+    out_shape = node.tensor_meta["shape"]
+    if is_nchw:
+        out_shape = [out_shape[i] for i in nhwc]
+    output = ir.RankedTensorType.get(out_shape, result_element_type)
+    stride_attr = ir._denseI64ArrayAttr(stride, None)
+    assert groups == 1, 'tosa.conv2d only support one group'
+    if is_kernel_transposed:
+        if sum(input_padding) > 0 or sum(dilation) > len(dilation):
+            raise NotImplementedError
+        out_padding = node.args[7]
+        for _ in range(len(out_padding), 4):
+            out_padding = [0] + out_padding
+        out_padding_attr = ir._denseI64ArrayAttr(out_padding, None)
+        out_shape_attr = ir._denseI64ArrayAttr(out_shape, None)
+        op = tosa.TransposeConv2DOp(
+            output,
+            input1,
+            weight,
+            bias_tensor,
+            out_padding_attr,
+            stride_attr,
+            out_shape_attr,
+        )
+    else:
+        input_padding_attr = ir._denseI64ArrayAttr(input_padding, None)
+        dilation_attr = ir._denseI64ArrayAttr(dilation, None)
+        op = tosa.Conv2DOp(
+            output,
+            input1,
+            weight,
+            bias_tensor,
+            input_padding_attr,
+            stride_attr,
+            dilation_attr,
+        )
+    if is_nchw:
+        nchw = [0, 3, 1, 2]
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", nchw)))
+        )
+        permute_result_type = ir.RankedTensorType.get(
+            [out_shape[i] for i in nchw], result_element_type
+        )
+        op = tosa.TransposeOp(
+            permute_result_type, op.result, perm_const_op.results[0]
+        )
+    return op
+
+
+def relu_op(node: ReluOp, symbol_table):
+    """
+    Import the tensor relu operation.
+    From Buddy ReluOp to MLIR TOSA `maximum` operation: the input is
+    combined with a zero splat constant that has the node's output shape.
+    Returns None when the input is not in the symbol table.
+    """
+    assert len(node.args) == 1
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    if input1 is None:
+        return
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    zero = mlir_element_attr_get(dtype, 0)
+    tensor_type = ir.RankedTensorType.get(output_shape, zero.type)
+    zero_op = tosa.ConstOp(ir.DenseElementsAttr.get_splat(tensor_type, zero))
+    op = tosa.MaximumOp(tensor_type, input1, zero_op)
+
+    return op
+
+
+def iota_op(node: IotaOp, symbol_table):
+    """
+    Import the tensor iota operation.
+    From Buddy IotaOp to MLIR TOSA `ConstOp` operation: the values of
+    `numpy.arange(start, end, step)` become a dense constant. `end` is the
+    single positional arg; `start` and `step` are read from kwargs.
+    """
+    assert len(node.args) == 1
+    shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    first = node.kwargs["start"]
+    limit = node.args[0]
+    stride = node.kwargs["step"]
+    element_type = mlir_element_type_get(dtype)
+    result_type = ir.RankedTensorType.get(shape, element_type)
+    values = ir.DenseElementsAttr.get(
+        numpy.arange(first, limit, stride), type=result_type
+    )
+
+    return tosa.ConstOp(values)
+
+
+def sigmoid_op(node: SigmoidOp, symbol_table):
+    """
+    Import the tensor sigmoid operation.
+    From Buddy SigmoidOp to MLIR TOSA `SigmoidOp` operation. The result
+    type is built from the node's `tensor_meta` shape and dtype; None is
+    returned when the input is not in the symbol table.
+    """
+    assert len(node.args) == 1
+    operand = symbol_table.get((str(node.args[0]), 0))
+    if operand is None:
+        return None
+    meta = node.tensor_meta
+    result_type = ir.RankedTensorType.get(
+        list(meta["shape"]), mlir_element_type_get(meta["dtype"])
+    )
+    return tosa.SigmoidOp(result_type, operand)
+
+
+def reciprocal_op(node: ReciprocalOp, symbol_table):
+    operand = symbol_table.get((str(node.args[0]), 0))  # sole input value
+    return tosa.ReciprocalOp(operand.type, operand)  # result type == input
+
+
+def mean_op(node: MeanOp, symbol_table):
+    """
+    Import the tensor mean operation.
+    From Buddy MeanOp to MLIR TOSA `reduce_sum`, `reciprocal` and `mul`
+    operations, followed by a `reshape` when `keepdim` is false.
+    Args:
+        node: Buddy node with args (input, dim or list of dims, keepdim).
+        symbol_table: Maps (name, index) keys to emitted MLIR values.
+    Returns:
+        The final op of that chain (`mul` or `reshape`).
+    """
+    input_tensor = symbol_table.get((str(node.args[0]), 0))
+    keepdim = node.args[2]
+    tensor_shp = ir.RankedTensorType(input_tensor.type).shape
+    # Accept a bare int as well as a sequence, then wrap negative axes.
+    dims = node.args[1]
+    dims = [dims] if isinstance(dims, int) else list(dims)
+    dims = [dim + len(tensor_shp) if dim < 0 else dim for dim in dims]
+    # Sum over each requested axis in turn and count the reduced elements.
+    reduce_sum_result = input_tensor
+    dim_size = 1
+    for dim_item in dims:
+        axis = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), dim_item)
+        reduce_sum_op = tosa.ReduceSumOp(reduce_sum_result, axis)
+        reduce_sum_result = reduce_sum_op.results[0]
+        dim_size *= tensor_shp[dim_item]
+    # Divide by the element count as a multiply by its (f32) reciprocal.
+    denominator_const_op = tosa.ConstOp(
+        ir.DenseElementsAttr.get(memoryview(array.array("f", [dim_size])))
+    )
+    reciprocal = tosa.ReciprocalOp(
+        denominator_const_op.results[0].type, denominator_const_op
+    )
+    ret = tosa.MulOp(
+        reduce_sum_result.type,
+        reciprocal.results[0],
+        reduce_sum_result,
+        ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0),
+    )
+
+    if not keepdim:
+        # Drop only the reduced axes; unrelated size-1 axes must survive.
+        result_shp = ir.RankedTensorType(ret.results[0].type).shape
+        result_shp = [n for i, n in enumerate(result_shp) if i not in dims]
+        ret = tosa.ReshapeOp(
+            ret.results[0], memoryview(array.array("i", result_shp))
+        )
+
+    return ret
+
+
 ops_registry = {
-    "add.Tensor": add_op,
-    "mul.Tensor": mul_op,
-    "sub.Tensor": sub_op,
-    "sum.dim_IntList": sum_op,
-    "tanh.default": tanh_op,
-    "amax.default": amax_op,
-    "rsqrt.default": rsqrt_op,
-    "bmm.default": bmm_op,
-    "clone.default": clone_op,
-    "div.Tensor": div_op,
-    "exp.default": exp_op,
-    "expand.default": expand_op,
-    "var_mean.correction": var_mean_op,
-    "addmm.default": addmm_op,
-    "reshape.default": reshape_op,
-    "view.default": reshape_op,
-    "select.int": select_op,
-    "slice.Tensor": slice_op,
-    "embedding.default": embedding_op,
-    "convert_element_type.default": convert_element_type_op,
-    "permute.default": permute_op,
-    "unsqueeze.default": unsqueeze_op,
-    "t.default": t_op,
-    "transpose.int": transpose_op,
+    "AddOp": add_op,
+    "MulOp": mul_op,
+    "SubOp": sub_op,
+    "SumDimOp": sum_op,
+    "TanhOp": tanh_op,
+    "AmaxOp": amax_op,
+    "RsqrtOp": rsqrt_op,
+    "BatchMatmulOp": bmm_op,
+    "CloneOp": clone_op,
+    "DivOp": div_op,
+    "ExpOp": exp_op,
+    "ExpandOp": expand_op,
+    "VarMeanOp": var_mean_op,
+    "AddMMOp": addmm_op,
+    "ReshapeOp": reshape_op,
+    "ViewOp": reshape_op,
+    "SelectOp": select_op,
+    "SliceOp": slice_op,
+    "EmbeddingOp": embedding_op,
+    "ConvertElementTypeOp": convert_element_type_op,
+    "PermuteOp": permute_op,
+    "UnsqueezeOp": unsqueeze_op,
+    "TOp": t_op,
+    "TransposeOp": transpose_op,
+    "MaxPool2dOp": maxpool2d_op,
+    "Conv2dOp": convolution2d_op,
+    "ReluOp": relu_op,
+    "IotaOp": iota_op,
+    "SigmoidOp": sigmoid_op,
+    "ReciprocalOp": reciprocal_op,
+    "MeanOp": mean_op,
 }
diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py
new file mode 100644
index 000000000..337f5a6b4
--- /dev/null
+++ b/frontend/Python/ops/utils.py
@@ -0,0 +1,56 @@
+# ===- utils.py ----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# The element utils of mlir element type and attribute.
+#
+# ===---------------------------------------------------------------------------
+
+from typing import Dict
+import mlir.ir as ir
+
+from ..graph import TensorDType
+
+
+def mlir_element_type_get(type_name):
+    """
+    Get the mlir element type based on TensorDType's enum type `type_name`
+    (Float32, Int64 or Bool); any other value raises NotImplementedError.
+    """
+    match type_name:
+        case TensorDType.Float32:
+            return ir.F32Type.get()
+        case TensorDType.Int64:
+            return ir.IntegerType.get_signless(64)
+        case TensorDType.Bool:
+            return ir.IntegerType.get_signless(1)
+    raise NotImplementedError(f"Unsupported TensorDType: {type_name}")
+
+
+def mlir_element_attr_get(type_name, value):
+    """
+    Get the mlir element attribute based on TensorDType's enum type
+    `type_name` (Float32, Int64 or Bool) holding the real value `value`;
+    any other `type_name` raises NotImplementedError.
+    """
+    match type_name:
+        case TensorDType.Float32:
+            return ir.FloatAttr.get(ir.F32Type.get(), value)
+        case TensorDType.Int64:
+            return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value)
+        case TensorDType.Bool:
+            return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value)
+    raise NotImplementedError(f"Unsupported TensorDType: {type_name}")
+
diff --git a/midend/include/Dialect/VectorExp/VectorExpOps.td b/midend/include/Dialect/VectorExp/VectorExpOps.td
index 67f492643..aeacba34d 100644
--- a/midend/include/Dialect/VectorExp/VectorExpOps.td
+++ b/midend/include/Dialect/VectorExp/VectorExpOps.td
@@ -25,6 +25,8 @@ include "VectorExpDialect.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
+include "mlir/IR/AttrTypeBase.td"
+
 //===----------------------------------------------------------------------===//
 // Vector Predication Operation
 //===----------------------------------------------------------------------===//
@@ -41,6 +43,30 @@ def VectorExp_PredicationOp : VectorExp_Op<"predication"> {
       "$region `:` type($result)";
 }
 
+//===----------------------------------------------------------------------===//
+// Vector GetVL Operation
+//===----------------------------------------------------------------------===//
+
+def VectorExp_GetVLOp : VectorExp_Op<"get_vl"> {
+  let summary = "Vector Experiment GetVL Operation.";
+  let arguments = (ins TypeAttr:$dtype, IndexAttr:$lmul);
+  let results = (outs Index:$result);
+  let assemblyFormat = "$dtype `,` $lmul attr-dict `:` type($result)";
+}
+
+//===----------------------------------------------------------------------===//
+// Vector SetVL Operation
+//===----------------------------------------------------------------------===//
+
+def VectorExp_SetVLOp : VectorExp_Op<"set_vl"> {
+  let summary = "Vector Experiment SetVL Operation.";
+  let arguments = (ins Index:$vl);
+  // TODO: Add optional returns.
+  // let results = (outs AnyType:$result);
+  let regions = (region AnyRegion:$region);
+  let assemblyFormat = "$vl attr-dict `:` type($vl) $region";
+}
+
 //===----------------------------------------------------------------------===//
 // Vector Load Operation with Dynamic Length
 //===----------------------------------------------------------------------===//
diff --git a/midend/include/Utils/DAPUtils.h b/midend/include/Utils/DAPUtils.h
new file mode 100644
index 000000000..9a9f418c7
--- /dev/null
+++ b/midend/include/Utils/DAPUtils.h
@@ -0,0 +1,58 @@
+//====- DAPUtils.h --------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines DAP dialect specific utility functions for the buddy
+// compiler ecosystem.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INCLUDE_UTILS_DAPUTILS_H
+#define INCLUDE_UTILS_DAPUTILS_H
+
+#include "Utils/Utils.h"
+#include <stdarg.h>
+
+using namespace mlir;
+
+namespace buddy {
+namespace dap {
+
+// Generate 5 vector params from SOS matrices
+SmallVector<Value, 5> generateSOSParams(OpBuilder &rewriter, Location loc,
+                                        VectorType vectorTy, Value f0, Value f1,
+                                        Value c0, Value c1, Value c2, Value c4,
+                                        Value c5, Value filterSize,
+                                        Value kernel);
+
+// Process the IIR operation; the results are stored in the output MemRef.
+void biquadProcess(OpBuilder &rewriter, Location loc, VectorType vectorTy,
+                   Value f0, Value c0, Value c1, Value cUpperBound,
+                   Value iUpperBound, SmallVector<Value, 5> SOSParams,
+                   ArrayRef<int64_t> arrayRef, Value N, Value input,
+                   Value output);
+
+// Run the complete IIR vectorization process for a specific vector length.
+void iirVectorizationProcess(OpBuilder &rewriter, Location loc, uint64_t vecLen,
+                             FloatType floatType, Value f0, Value f1, Value c0,
+                             Value c1, Value c2, Value c4, Value c5,
+                             Value filterSize, Value kernel,
+                             ArrayRef<int64_t> arrayRef, Value N, Value input,
+                             Value output);
+
+} // namespace dap
+} // namespace buddy
+
+#endif // INCLUDE_UTILS_DAPUTILS_H
diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt
index 0d94bfa16..fc47e4171 100644
--- a/midend/lib/Conversion/CMakeLists.txt
+++ b/midend/lib/Conversion/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(LowerBud)
 add_subdirectory(LowerDIP)
 add_subdirectory(LowerRVV)
 add_subdirectory(LowerDAP)
+add_subdirectory(DAPVectorization)
 add_subdirectory(MatMulOptimization)
 add_subdirectory(TransposeOptimization)
 add_subdirectory(ConvOptimization)
@@ -11,3 +12,4 @@ add_subdirectory(LowerGemmini)
 add_subdirectory(LowerLinalgToGemmini)
 add_subdirectory(SchedulingOnDevices)
 add_subdirectory(LowerSche)
+add_subdirectory(MLIRGPU)
diff --git a/midend/lib/Conversion/DAPVectorization/CMakeLists.txt b/midend/lib/Conversion/DAPVectorization/CMakeLists.txt
new file mode 100644
index 000000000..d67592051
--- /dev/null
+++ b/midend/lib/Conversion/DAPVectorization/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_mlir_library(DAPVectorization
+  DAPVectorization.cpp
+
+  LINK_LIBS PUBLIC
+  BuddyDAPUtils
+)
diff --git a/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp
new file mode 100644
index 000000000..8c3eb3069
--- /dev/null
+++ b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp
@@ -0,0 +1,222 @@
+//====- DAPVectorization.cpp - DAP Dialect Vectorization Pass  ------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines DAP dialect vectorization pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Pass/Pass.h"
+
+#include "DAP/DAPDialect.h"
+#include "DAP/DAPOps.h"
+#include "Utils/DAPUtils.h"
+#include <optional>
+
+using namespace mlir;
+using namespace buddy;
+using namespace vector;
+using namespace mlir::arith;
+using namespace mlir::linalg;
+
+//===----------------------------------------------------------------------===//
+// Rewrite Pattern
+//===----------------------------------------------------------------------===//
+
+namespace {
+class DAPIirVectorization : public OpRewritePattern<dap::IirOp> {
+public:
+  using OpRewritePattern<dap::IirOp>::OpRewritePattern;
+
+  explicit DAPIirVectorization(MLIRContext *context)
+      : OpRewritePattern(context) {}
+  // Replaces the IirOp by a runtime choice among vector lengths 4 to 64.
+  LogicalResult matchAndRewrite(dap::IirOp op,
+                                PatternRewriter &rewriter) const override {
+    auto loc = op->getLoc();
+    auto ctx = op->getContext();
+
+    Value input = op->getOperand(0);
+    Value kernel = op->getOperand(1);
+    Value output = op->getOperand(2);
+    // Index constants used by the size checks and passed to the helper.
+    Value c0 = rewriter.create<ConstantIndexOp>(loc, 0);
+    Value c1 = rewriter.create<ConstantIndexOp>(loc, 1);
+    Value c2 = rewriter.create<ConstantIndexOp>(loc, 2);
+    Value c4 = rewriter.create<ConstantIndexOp>(loc, 4);
+    Value c5 = rewriter.create<ConstantIndexOp>(loc, 5);
+    Value c8 = rewriter.create<ConstantIndexOp>(loc, 8);
+    Value c16 = rewriter.create<ConstantIndexOp>(loc, 16);
+    Value c32 = rewriter.create<ConstantIndexOp>(loc, 32);
+    // Runtime sizes: dimension 0 of the input and of the kernel.
+    Value N = rewriter.create<memref::DimOp>(loc, input, c0);
+    Value filterSize = rewriter.create<memref::DimOp>(loc, kernel, c0);
+
+    FloatType f32 = FloatType::getF32(ctx);
+    Value f0 = rewriter.create<ConstantFloatOp>(loc, APFloat(0.0f), f32);
+    Value f1 = rewriter.create<ConstantFloatOp>(loc, APFloat(1.0f), f32);
+    // Unsigned checks: filterSize <= 4 / 8 / 16 / 32.
+    Value cond4 =
+        rewriter.create<CmpIOp>(loc, CmpIPredicate::ule, filterSize, c4);
+    Value cond8 =
+        rewriter.create<CmpIOp>(loc, CmpIPredicate::ule, filterSize, c8);
+    Value cond16 =
+        rewriter.create<CmpIOp>(loc, CmpIPredicate::ule, filterSize, c16);
+    Value cond32 =
+        rewriter.create<CmpIOp>(loc, CmpIPredicate::ule, filterSize, c32);
+    // Nested scf.if cascade: first check that holds wins, 64 is the fallback.
+    // clang-format off
+    rewriter.create<scf::IfOp>(loc, cond4,
+    /*thenBuilder=*/
+    [&](OpBuilder &builder, Location loc) {
+        dap::iirVectorizationProcess(builder, loc, 4, f32, f0, f1, c0, c1, c2, c4, c5,
+                                     filterSize, kernel, ArrayRef<int64_t>{0, 0, 1, 2},
+                                     N, input, output);
+        // Each index list has vecLen entries: {0, 0, 1, ..., vecLen - 2}.
+        builder.create<scf::YieldOp>(loc);
+    },
+    /*elseBuilder=*/
+    [&](OpBuilder &builder, Location loc) {
+        builder.create<scf::IfOp>(loc, cond8,
+        /*thenBuilder=*/
+        [&](OpBuilder &builder, Location loc){
+            dap::iirVectorizationProcess(builder, loc, 8, f32, f0, f1, c0, c1, c2, c4, c5,
+                                         filterSize, kernel, 
+                                         ArrayRef<int64_t>{0, 0, 1, 2, 3, 4, 5, 6}, N, 
+                                         input, output);
+            
+            builder.create<scf::YieldOp>(loc);
+        },
+        /*elseBuilder=*/
+        [&](OpBuilder &builder, Location loc) {
+            builder.create<scf::IfOp>(loc, cond16,
+            /*thenBuilder=*/
+            [&](OpBuilder &builder, Location loc){
+                dap::iirVectorizationProcess(builder, loc, 16, f32, f0, f1, c0, c1, c2, c4, c5,
+                                             filterSize, kernel, ArrayRef<int64_t>{0, 0, 1, 2, 
+                                             3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, N, 
+                                             input, output);
+                
+                builder.create<scf::YieldOp>(loc);
+            },
+            /*elseBuilder=*/
+            [&](OpBuilder &builder, Location loc) {
+                builder.create<scf::IfOp>(loc, cond32,
+                /*thenBuilder=*/
+                [&](OpBuilder &builder, Location loc){
+                    dap::iirVectorizationProcess(builder, loc, 32, f32, f0, f1, c0, c1, c2, c4, c5,
+                                                 filterSize, kernel, ArrayRef<int64_t>{0, 0, 1, 2, 
+                                                 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
+                                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                                                 30}, N, input, output);
+                    
+                    builder.create<scf::YieldOp>(loc);
+                },
+                /*elseBuilder=*/
+                [&](OpBuilder &builder, Location loc) {
+                    dap::iirVectorizationProcess(builder, loc, 64, f32, f0, f1, c0, c1, c2, c4, c5,
+                                                 filterSize, kernel, ArrayRef<int64_t>{0, 0, 1, 2, 3, 4,
+                                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
+                                                 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
+                                                 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 
+                                                 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+                                                 62}, N, input, output);
+                    
+                    builder.create<scf::YieldOp>(loc);
+                }
+                );
+                builder.create<scf::YieldOp>(loc);
+            });
+
+            builder.create<scf::YieldOp>(loc);
+        });
+
+        builder.create<scf::YieldOp>(loc);
+    });
+    // clang-format on
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void populateVectorizeDAPConversionPatterns(RewritePatternSet &patterns) {
+  patterns.add<DAPIirVectorization>(patterns.getContext());
+}
+
+//===----------------------------------------------------------------------===//
+// VectorizeDAPPass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class VectorizeDAPPass
+    : public PassWrapper<VectorizeDAPPass, OperationPass<ModuleOp>> {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VectorizeDAPPass)
+  VectorizeDAPPass() = default;
+  VectorizeDAPPass(const VectorizeDAPPass &) {}
+
+  StringRef getArgument() const final { return "vectorize-dap"; }
+  StringRef getDescription() const final { return "Vectorize DAP Dialect."; }
+
+  void runOnOperation() override;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<buddy::dap::DAPDialect, func::FuncDialect,
+                    memref::MemRefDialect, scf::SCFDialect, VectorDialect,
+                    affine::AffineDialect, arith::ArithDialect,
+                    linalg::LinalgDialect>();
+  }
+};
+} // end anonymous namespace.
+
+void VectorizeDAPPass::runOnOperation() {
+  MLIRContext *context = &getContext();
+  ModuleOp module = getOperation();
+
+  ConversionTarget target(*context);
+  // clang-format off
+  target.addLegalDialect<
+      affine::AffineDialect,
+      scf::SCFDialect,
+      func::FuncDialect,
+      memref::MemRefDialect,
+      VectorDialect,
+      arith::ArithDialect,
+      linalg::LinalgDialect>();
+  target.addLegalOp<ModuleOp, func::FuncOp, func::ReturnOp>();
+  // clang-format on
+
+  RewritePatternSet patterns(context);
+  populateVectorizeDAPConversionPatterns(patterns);
+
+  if (failed(applyPartialConversion(module, target, std::move(patterns))))
+    signalPassFailure();
+}
+
+namespace mlir {
+namespace buddy {
+void registerDAPVectorizePass() { PassRegistration<VectorizeDAPPass>(); }
+} // namespace buddy
+} // namespace mlir
diff --git a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp
index 33148d547..bf77f358b 100644
--- a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp
+++ b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp
@@ -21,11 +21,11 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
 
 #include "DAP/DAPDialect.h"
 #include "DAP/DAPOps.h"
@@ -175,10 +175,7 @@ class DAPIirLowering : public OpRewritePattern<dap::IirOp> {
 public:
   using OpRewritePattern<dap::IirOp>::OpRewritePattern;
 
-  explicit DAPIirLowering(MLIRContext *context, int64_t strideParam)
-      : OpRewritePattern(context) {
-    stride = strideParam;
-  }
+  explicit DAPIirLowering(MLIRContext *context) : OpRewritePattern(context) {}
 
   LogicalResult matchAndRewrite(dap::IirOp op,
                                 PatternRewriter &rewriter) const override {
@@ -197,141 +194,60 @@ class DAPIirLowering : public OpRewritePattern<dap::IirOp> {
 
     Value N = rewriter.create<memref::DimOp>(loc, input, c0);
     Value filterSize = rewriter.create<memref::DimOp>(loc, kernel, c0);
-    Value strideVal = rewriter.create<ConstantIndexOp>(loc, stride);
 
     FloatType f32 = FloatType::getF32(ctx);
 
-    VectorType vectorTy32 = VectorType::get({stride}, f32);
-
-    Value zr = rewriter.create<ConstantFloatOp>(loc, APFloat(float(0)), f32);
-    // calculate the upper bound of the FIR part <scf::ForOp>
-    Value strictN = rewriter.create<SubIOp>(loc, N, c2);
-    Value strideRem = rewriter.create<RemSIOp>(loc, strictN, strideVal);
-    Value upperN = rewriter.create<SubIOp>(loc, N, strideRem);
-
     // loop over every row in SOS matrix
     rewriter.create<scf::ForOp>(
-        loc, c0, filterSize, c1, ValueRange{std::nullopt},
-        [&](OpBuilder &builder, Location loc, ValueRange ivs,
-            ValueRange iargs) {
-          Value b0 = builder.create<memref::LoadOp>(loc, kernel,
-                                                    ValueRange{ivs[0], c0});
-          Value b1 = builder.create<memref::LoadOp>(loc, kernel,
-                                                    ValueRange{ivs[0], c1});
-          Value b2 = builder.create<memref::LoadOp>(loc, kernel,
-                                                    ValueRange{ivs[0], c2});
-          // Value a0 of kernel is not used
-          Value a1 = builder.create<memref::LoadOp>(loc, kernel,
-                                                    ValueRange{ivs[0], c4});
-          Value a2 = builder.create<memref::LoadOp>(loc, kernel,
-                                                    ValueRange{ivs[0], c5});
+        loc, c0, filterSize, c1, ValueRange{input},
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iarg) {
+          Value b0 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c0});
+          Value b1 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c1});
+          Value b2 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c2});
+          Value a1 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c4});
+          Value a2 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c5});
 
           Value z1 =
               builder.create<ConstantFloatOp>(loc, APFloat(float(0)), f32);
           Value z2 =
               builder.create<ConstantFloatOp>(loc, APFloat(float(0)), f32);
 
-          Value x0 = builder.create<memref::LoadOp>(loc, input, ValueRange{c0});
-          Value temp = builder.create<MulFOp>(loc, b0, x0);
-          builder.create<memref::StoreOp>(loc, temp, output, ValueRange{c0});
-
-          Value x1 = builder.create<memref::LoadOp>(loc, input, ValueRange{c1});
-          Value temp0 = builder.create<MulFOp>(loc, b0, x1);
-          Value temp1 = builder.create<MulFOp>(loc, b1, x0);
-          Value temp2 = builder.create<AddFOp>(loc, temp0, temp1);
-          builder.create<memref::StoreOp>(loc, temp2, output, ValueRange{c1});
-
-          Value Vecb0 =
-              builder.create<vector::BroadcastOp>(loc, vectorTy32, b0);
-          Value Vecb1 =
-              builder.create<vector::BroadcastOp>(loc, vectorTy32, b1);
-          Value Vecb2 =
-              builder.create<vector::BroadcastOp>(loc, vectorTy32, b2);
-
-          // A biquad filter expression:
-          // y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2];
-          // FIR part
-          builder.create<scf::ForOp>(
-              loc, c2, upperN, strideVal, ValueRange{std::nullopt},
-              [&](OpBuilder &builder, Location loc, Value iv,
-                  ValueRange itrargs) {
-                Value idx0 = iv;
-                Value idx1 = builder.create<SubIOp>(loc, idx0, c1);
-                Value idx2 = builder.create<SubIOp>(loc, idx0, c2);
-
-                Value inputVec0 = builder.create<LoadOp>(loc, vectorTy32, input,
-                                                         ValueRange{idx0});
-                Value inputVec1 = builder.create<LoadOp>(loc, vectorTy32, input,
-                                                         ValueRange{idx1});
-                Value inputVec2 = builder.create<LoadOp>(loc, vectorTy32, input,
-                                                         ValueRange{idx2});
-
-                Value outputVec =
-                    rewriter.create<vector::BroadcastOp>(loc, vectorTy32, zr);
-                Value resVec0 =
-                    builder.create<FMAOp>(loc, inputVec0, Vecb0, outputVec);
-                Value resVec1 =
-                    builder.create<FMAOp>(loc, inputVec1, Vecb1, resVec0);
-                Value resVec2 =
-                    builder.create<FMAOp>(loc, inputVec2, Vecb2, resVec1);
-                builder.create<StoreOp>(loc, resVec2, output, ValueRange{idx0});
-
-                builder.create<scf::YieldOp>(loc, std::nullopt);
-              });
-
-          // process the remain data of FIR part
-          Value idx1 = builder.create<SubIOp>(loc, upperN, c1);
-          Value idx2 = builder.create<SubIOp>(loc, upperN, c2);
-          Value in1 = 
-              builder.create<memref::LoadOp>(loc, input, ValueRange{idx1});
-          Value in2 = 
-              builder.create<memref::LoadOp>(loc, input, ValueRange{idx2});
-
-          builder.create<scf::ForOp>(
-              loc, upperN, N, c1, ValueRange{in1, in2}, 
-              [&](OpBuilder &builder, Location loc, Value iv,
-                  ValueRange itrargs) {
-                Value in0 = 
-                    builder.create<memref::LoadOp>(loc, input, ValueRange{iv});
-
-                Value temp0 = builder.create<MulFOp>(loc, b0, in0);
-                Value temp1 = builder.create<MulFOp>(loc, b1, in1);
-                Value temp2 = builder.create<MulFOp>(loc, b2, in2);
-                Value sum0 = builder.create<AddFOp>(loc, temp0, temp1);
-                Value sum1 = builder.create<AddFOp>(loc, sum0, temp2);
-                
-                builder.create<memref::StoreOp>(loc, sum1, output, ValueRange{iv});
-
-                builder.create<scf::YieldOp>(loc, std::vector<Value>{in0, in1});
-              });
-
-          // IIR part
+          // Loop reordering: each iteration computes z1 for the next iteration
+          // and z2 for the iteration after that.
           builder.create<scf::ForOp>(
               loc, c0, N, c1, ValueRange{z1, z2},
               [&](OpBuilder &builder, Location loc, Value iv,
-                  ValueRange itrargs) {
-                Value x =
-                    builder.create<memref::LoadOp>(loc, output, ValueRange{iv});
-                Value t1 = builder.create<MulFOp>(loc, a1, itrargs[1]);
-                Value t2 = builder.create<MulFOp>(loc, a2, itrargs[0]);
-                Value y = builder.create<AddFOp>(loc, t1, t2);
-                Value opt = builder.create<SubFOp>(loc, x, y);
-
-                builder.create<memref::StoreOp>(loc, opt, output,
-                                                ValueRange{iv});
+                  ValueRange iargs) {
+                Value inElem = builder.create<memref::LoadOp>(loc, iarg[0], iv);
+                Value t0 = builder.create<arith::MulFOp>(loc, b0, inElem);
+                Value outElem =
+                    builder.create<arith::AddFOp>(loc, t0, iargs[0]);
+
+                Value t1 = builder.create<arith::MulFOp>(loc, b1, inElem);
+                Value t2 = builder.create<arith::MulFOp>(loc, a1, outElem);
+                Value t3 = builder.create<arith::SubFOp>(loc, t1, t2);
+                Value z1Next = builder.create<arith::AddFOp>(loc, t3, iargs[1]);
+
+                Value t4 = builder.create<arith::MulFOp>(loc, b2, inElem);
+                Value t5 = builder.create<arith::MulFOp>(loc, a2, outElem);
+                Value z2Next = builder.create<arith::SubFOp>(loc, t4, t5);
+
+                builder.create<memref::StoreOp>(loc, outElem, output, iv);
                 builder.create<scf::YieldOp>(
-                    loc, std::vector<Value>{itrargs[1], opt});
+                    loc, std::vector<Value>{z1Next, z2Next});
               });
-          builder.create<memref::CopyOp>(loc, output, input);
-          builder.create<scf::YieldOp>(loc, std::nullopt);
+
+          builder.create<scf::YieldOp>(loc, output);
         });
 
     rewriter.eraseOp(op);
     return success();
   }
-
-private:
-  int64_t stride;
 };
 
 } // end anonymous namespace
@@ -340,7 +256,7 @@ void populateLowerDAPConversionPatterns(RewritePatternSet &patterns,
                                         int64_t stride) {
   patterns.add<DAPFirLowering>(patterns.getContext());
   patterns.add<DAPBiquadLowering>(patterns.getContext(), stride);
-  patterns.add<DAPIirLowering>(patterns.getContext(), stride);
+  patterns.add<DAPIirLowering>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
@@ -363,7 +279,8 @@ class LowerDAPPass : public PassWrapper<LowerDAPPass, OperationPass<ModuleOp>> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<buddy::dap::DAPDialect, func::FuncDialect,
                     memref::MemRefDialect, scf::SCFDialect, VectorDialect,
-                    affine::AffineDialect, arith::ArithDialect,linalg::LinalgDialect>();
+                    affine::AffineDialect, arith::ArithDialect,
+                    linalg::LinalgDialect>();
   }
   Option<int64_t> stride{*this, "DAP-vector-splitting",
                          llvm::cl::desc("Vector splitting size."),
@@ -376,10 +293,10 @@ void LowerDAPPass::runOnOperation() {
   ModuleOp module = getOperation();
 
   ConversionTarget target(*context);
-  target.addLegalDialect<affine::AffineDialect, scf::SCFDialect,
-                         func::FuncDialect, memref::MemRefDialect,
-                         VectorDialect, arith::ArithDialect,
-                         linalg::LinalgDialect>();
+  target
+      .addLegalDialect<affine::AffineDialect, scf::SCFDialect,
+                       func::FuncDialect, memref::MemRefDialect, VectorDialect,
+                       arith::ArithDialect, linalg::LinalgDialect>();
   target.addLegalOp<ModuleOp, func::FuncOp, func::ReturnOp>();
 
   RewritePatternSet patterns(context);
diff --git a/midend/lib/Conversion/LowerSche/LowerSchePass.cpp b/midend/lib/Conversion/LowerSche/LowerSchePass.cpp
index 882ffd2f4..0d1da54f2 100644
--- a/midend/lib/Conversion/LowerSche/LowerSchePass.cpp
+++ b/midend/lib/Conversion/LowerSche/LowerSchePass.cpp
@@ -1,4 +1,4 @@
-//====- LowerSchePass.cpp - Sche Dialect Lowering Pass  ---------------------===//
+//====- LowerSchePass.cpp - Sche Dialect Lowering Pass  -------------------===//
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,19 +19,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/IR/Builders.h"
 
 #include "Bud/BudDialect.h"
 #include "Bud/BudOps.h"
@@ -40,7 +40,6 @@
 
 #include <unordered_map>
 
-
 using namespace mlir;
 using namespace buddy;
 
@@ -50,81 +49,100 @@ using namespace buddy;
 
 namespace {
 
-class WaitOpScheLowering : public ConversionPattern  {
+class WaitOpScheLowering : public ConversionPattern {
 public:
-  explicit WaitOpScheLowering(TypeConverter &typeConverter, MLIRContext *context)
-      : ConversionPattern(typeConverter, sche::WaitOp::getOperationName(), 1, context) {}
-
+  explicit WaitOpScheLowering(TypeConverter &typeConverter,
+                              MLIRContext *context)
+      : ConversionPattern(typeConverter, sche::WaitOp::getOperationName(), 1,
+                          context) {}
 
   LogicalResult
-  matchAndRewrite(Operation* op, ArrayRef<mlir::Value> operands, 
+  matchAndRewrite(Operation *op, ArrayRef<mlir::Value> operands,
                   mlir::ConversionPatternRewriter &rewriter) const final {
     assert(operands.size() == 1);
     auto loc = op->getLoc();
-    auto typeConverter = getTypeConverter();
+    // Replace the sche WaitOp with an async::AwaitOp on its single operand.
     rewriter.setInsertionPoint(op);
-    auto awaitOp = rewriter.create<async::AwaitOp>(loc, operands[0]);
+    rewriter.create<async::AwaitOp>(loc, operands[0]);
     rewriter.eraseOp(op);
     return success();
   }
 };
 
-//lower to GPU Dialect
-class OnDeviceOpScheLowering : public ConversionPattern  {
+// Lowers sche::OnDeviceOp to the GPU dialect.
+class OnDeviceOpScheLowering : public ConversionPattern {
 public:
-explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *context)
-      : ConversionPattern(typeConverter, sche::OnDeviceOp::getOperationName(), 1, context) {}
-
-  //convert operands with tensor or vector type into memref operands, and register these operands to GPU
-  OpBuilder::InsertPoint convertOperands(mlir::ConversionPatternRewriter &rewriter, ValueRange operands, IRMapping &mp, Location& loc, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointToBlockStart) const {
+  explicit OnDeviceOpScheLowering(TypeConverter &typeConverter,
+                                  MLIRContext *context)
+      : ConversionPattern(typeConverter, sche::OnDeviceOp::getOperationName(),
+                          1, context) {}
+
+  // Convert tensor- or vector-typed operands into memref operands and register
+  // them with the GPU.
+  OpBuilder::InsertPoint
+  convertOperands(mlir::ConversionPatternRewriter &rewriter,
+                  ValueRange operands, IRMapping &mp, Location &loc,
+                  OpBuilder::InsertPoint insertPointBeforeOp,
+                  OpBuilder::InsertPoint insertPointToBlockStart) const {
     rewriter.restoreInsertionPoint(insertPointBeforeOp);
-    for(auto v : operands){
+    for (auto v : operands) {
       auto t = v.getType();
-      if(isa<TensorType>(t)){
+      if (isa<TensorType>(t)) {
         auto shape = dyn_cast<TensorType>(t).getShape();
         auto ele_type = dyn_cast<TensorType>(t).getElementType();
-        auto to_memref_op = rewriter.create<bufferization::ToMemrefOp>(loc, MemRefType::get(shape, ele_type), v);
+        auto to_memref_op = rewriter.create<bufferization::ToMemrefOp>(
+            loc, MemRefType::get(shape, ele_type), v);
         mp.map(v, to_memref_op.getResult());
 
-        auto memref_cast_op = rewriter.create<memref::CastOp>(loc, UnrankedMemRefType::get(ele_type, {}), to_memref_op.getResult());
-        auto host_register_op = rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
-      }
-      else if(isa<VectorType>(t)){
+        auto memref_cast_op = rewriter.create<memref::CastOp>(
+            loc, UnrankedMemRefType::get(ele_type, {}),
+            to_memref_op.getResult());
+        rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
+      } else if (isa<VectorType>(t)) {
         auto shape = dyn_cast<VectorType>(t).getShape();
         auto ele_type = dyn_cast<VectorType>(t).getElementType();
         auto mem_type = MemRefType::get(shape, ele_type);
         auto alloc_op = rewriter.create<memref::AllocOp>(loc, mem_type);
-        auto memref_cast_op = rewriter.create<memref::CastOp>(loc, UnrankedMemRefType::get(ele_type, {}), alloc_op.getResult());
-        auto host_register_op = rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
-        auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
+        auto memref_cast_op = rewriter.create<memref::CastOp>(
+            loc, UnrankedMemRefType::get(ele_type, {}), alloc_op.getResult());
+        rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
+        auto idx0 =
+            rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+                .getResult();
         llvm::SmallVector<Value> indices(shape.size(), idx0);
-        auto vector_transfer_write_op = rewriter.create<vector::TransferWriteOp>(loc, v, alloc_op.getResult(), indices);
+        rewriter.create<vector::TransferWriteOp>(loc, v, alloc_op.getResult(),
+                                                 indices);
         mp.map(v, alloc_op.getResult());
-      }
-      else if(isa<UnrankedMemRefType>(t)){
-        auto host_register_op = rewriter.create<gpu::HostRegisterOp>(loc, v);
-      }
-      else if(isa<MemRefType>(t)){
+      } else if (isa<UnrankedMemRefType>(t)) {
+        rewriter.create<gpu::HostRegisterOp>(loc, v);
+      } else if (isa<MemRefType>(t)) {
         auto memref_type = dyn_cast<MemRefType>(t);
-        auto memref_cast_op = rewriter.create<memref::CastOp>(loc, UnrankedMemRefType::get(memref_type.getElementType(), memref_type.getMemorySpace()), v);
-        auto host_register_op = rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
-      }
-      else{
+        auto memref_cast_op = rewriter.create<memref::CastOp>(
+            loc,
+            UnrankedMemRefType::get(memref_type.getElementType(),
+                                    memref_type.getMemorySpace()),
+            v);
+        rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
+      } else {
         continue;
       }
     }
 
     rewriter.restoreInsertionPoint(insertPointToBlockStart);
-    for(auto v : operands){
+    for (auto v : operands) {
       auto t = v.getType();
-      if(isa<TensorType>(t)){
-        auto to_tensor_op = rewriter.create<bufferization::ToTensorOp>(loc, t, mp.lookup<Value>(v));
+      if (isa<TensorType>(t)) {
+        auto to_tensor_op = rewriter.create<bufferization::ToTensorOp>(
+            loc, t, mp.lookup<Value>(v));
         mp.map(v, to_tensor_op.getResult());
-      }
-      else if(isa<VectorType>(t)){
-        auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
-        llvm::SmallVector<Value> indices(dyn_cast<VectorType>(t).getShape().size(), idx0);
-        auto transfer_read_op = rewriter.create<vector::TransferReadOp>(loc, dyn_cast<VectorType>(t), mp.lookup<Value>(v), indices);
+      } else if (isa<VectorType>(t)) {
+        auto idx0 =
+            rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+                .getResult();
+        llvm::SmallVector<Value> indices(
+            dyn_cast<VectorType>(t).getShape().size(), idx0);
+        auto transfer_read_op = rewriter.create<vector::TransferReadOp>(
+            loc, dyn_cast<VectorType>(t), mp.lookup<Value>(v), indices);
         mp.map(v, transfer_read_op.getResult());
       }
     }
@@ -132,160 +150,209 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte
     return rewriter.saveInsertionPoint();
   }
 
-  //convert results with tensor or vector type into memref , and register these results to GPU
-  SmallVector<Value> convertResults(mlir::ConversionPatternRewriter &rewriter, ValueRange results, IRMapping &mp, Location& loc, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointAfterGpuLaunchOp) const {
+  // Convert tensor- or vector-typed results into memrefs and register these
+  // results with the GPU.
+  SmallVector<Value>
+  convertResults(mlir::ConversionPatternRewriter &rewriter, ValueRange results,
+                 IRMapping &mp, Location &loc,
+                 OpBuilder::InsertPoint insertPointBeforeOp,
+                 OpBuilder::InsertPoint insertPointAfterGpuLaunchOp) const {
     rewriter.restoreInsertionPoint(insertPointBeforeOp);
     SmallVector<Value> result_memrefs;
-    for(auto v : results){
+    for (auto v : results) {
       MemRefType mem_type;
       auto t = v.getType();
-      //TODO:必须要有rank
-      if(isa<TensorType>(t)){
+      // TODO: the type must be ranked
+      if (isa<TensorType>(t)) {
         auto shape = dyn_cast<TensorType>(t).getShape();
         auto ele_type = dyn_cast<TensorType>(t).getElementType();
         mem_type = MemRefType::get(shape, ele_type);
-      }
-      else if(isa<VectorType>(t)){
+      } else if (isa<VectorType>(t)) {
         auto shape = dyn_cast<VectorType>(t).getShape();
         auto ele_type = dyn_cast<VectorType>(t).getElementType();
         mem_type = MemRefType::get(shape, ele_type);
-      }
-      else if(isa<MemRefType>(t)){
+      } else if (isa<MemRefType>(t)) {
         mem_type = dyn_cast<MemRefType>(t);
-      }
-      else{
+      } else {
         mem_type = MemRefType::get({1}, t);
       }
       auto alloc_op = rewriter.create<memref::AllocOp>(loc, mem_type);
       result_memrefs.push_back(alloc_op.getResult());
-      auto memref_cast_op = rewriter.create<memref::CastOp>(loc, UnrankedMemRefType::get(mem_type.getElementType(), mem_type.getMemorySpace()), alloc_op.getResult());
-      auto host_register_op = rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
+      auto memref_cast_op = rewriter.create<memref::CastOp>(
+          loc,
+          UnrankedMemRefType::get(mem_type.getElementType(),
+                                  mem_type.getMemorySpace()),
+          alloc_op.getResult());
+      rewriter.create<gpu::HostRegisterOp>(loc, memref_cast_op.getResult());
     }
 
     rewriter.restoreInsertionPoint(insertPointAfterGpuLaunchOp);
-    //convert result'type into original type for returning
-    int i=0;
-    for(auto v : results){
+    // Convert each result back to its original type for returning.
+    int i = 0;
+    for (auto v : results) {
       auto t = v.getType();
-      //TODO:必须要有rank
-      if(isa<TensorType>(t)){
-        auto shape = dyn_cast_or_null<TensorType>(t).getShape();
-        auto ele_type = dyn_cast_or_null<TensorType>(t).getElementType();
-        auto to_tensor_op = rewriter.create<bufferization::ToTensorOp>(loc, t, result_memrefs[i++]);
+      // TODO: the type must be ranked
+      if (isa<TensorType>(t)) {
+        auto to_tensor_op = rewriter.create<bufferization::ToTensorOp>(
+            loc, t, result_memrefs[i++]);
         v.replaceAllUsesWith(to_tensor_op.getResult());
-      }
-      else if(isa<VectorType>(t)){
-        auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
-        llvm::SmallVector<Value> indices(dyn_cast<VectorType>(t).getShape().size(), idx0);
-        auto transfer_read_op = rewriter.create<vector::TransferReadOp>(loc, dyn_cast<VectorType>(t), result_memrefs[i++], indices);
+      } else if (isa<VectorType>(t)) {
+        auto idx0 =
+            rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+                .getResult();
+        llvm::SmallVector<Value> indices(
+            dyn_cast<VectorType>(t).getShape().size(), idx0);
+        auto transfer_read_op = rewriter.create<vector::TransferReadOp>(
+            loc, dyn_cast<VectorType>(t), result_memrefs[i++], indices);
         v.replaceAllUsesWith(transfer_read_op.getResult());
-      }
-      else if(isa<MemRefType>(t)){
+      } else if (isa<MemRefType>(t)) {
         v.replaceAllUsesWith(result_memrefs[i++]);
-      }
-      else{
-        auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
-        auto load_op = rewriter.create<memref::LoadOp>(loc, v.getType(), result_memrefs[i++], ValueRange{idx0});
+      } else {
+        auto idx0 =
+            rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+                .getResult();
+        auto load_op = rewriter.create<memref::LoadOp>(
+            loc, v.getType(), result_memrefs[i++], ValueRange{idx0});
         v.replaceAllUsesWith(load_op.getResult());
       }
     }
     return result_memrefs;
   }
 
-  //OnDeviceOp from ScfForOp conversion
-  void lowerFromForOp(scf::ForOp forOp, gpu::LaunchOp gpuLaunchOp, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointInGpuLaunchBody, Location loc,  PatternRewriter &rewriter,Value gridX, Value gridY, Value gridZ, Value blockX, Value blockY, Value blockZ) const {
+  // Lowers an OnDeviceOp that was converted from an scf::ForOp.
+  void lowerFromForOp(scf::ForOp forOp, gpu::LaunchOp gpuLaunchOp,
+                      OpBuilder::InsertPoint insertPointBeforeOp,
+                      OpBuilder::InsertPoint insertPointInGpuLaunchBody,
+                      Location loc, PatternRewriter &rewriter, Value gridX,
+                      Value gridY, Value gridZ, Value blockX, Value blockY,
+                      Value blockZ) const {
     rewriter.restoreInsertionPoint(insertPointBeforeOp);
     Value upperBound = forOp.getUpperBound();
     Value lowerBound = forOp.getLowerBound();
     Value step = forOp.getStep();
-    //Calculate the step size range required in a block
+    // Calculate the number of loop steps assigned to each block
     auto range = rewriter.create<arith::SubIOp>(loc, upperBound, lowerBound);
-    Value stepRange = rewriter.create<arith::DivSIOp>(loc, range.getResult(), step);
-    Value stepRangeInBlock = rewriter.create<arith::DivSIOp>(loc, stepRange, gridX);
+    Value stepRange =
+        rewriter.create<arith::DivSIOp>(loc, range.getResult(), step);
+    Value stepRangeInBlock =
+        rewriter.create<arith::DivSIOp>(loc, stepRange, gridX);
     Value remInBlock = rewriter.create<arith::RemSIOp>(loc, stepRange, gridX);
-    auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
-    auto idx1 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1)).getResult();
-
-    auto& body = gpuLaunchOp.getBody();
-    auto& bodyBlock = body.front();
+    auto idx0 =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+            .getResult();
+    auto idx1 =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1))
+            .getResult();
 
     rewriter.restoreInsertionPoint(insertPointInGpuLaunchBody);
-    Value start = rewriter.create<arith::MulIOp>(loc, stepRangeInBlock, gpuLaunchOp.getBlockIds().x);
+    Value start = rewriter.create<arith::MulIOp>(loc, stepRangeInBlock,
+                                                 gpuLaunchOp.getBlockIds().x);
     start = rewriter.create<arith::AddIOp>(loc, start, lowerBound);
-    Value cmp_rem_blkId = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, remInBlock, gpuLaunchOp.getBlockIds().x);
-    Value cmp_rem_blkId_index = rewriter.create<arith::IndexCastUIOp>(loc, rewriter.getIndexType(), cmp_rem_blkId);
-    stepRangeInBlock = rewriter.create<arith::AddIOp>(loc, cmp_rem_blkId_index, stepRangeInBlock);
-    Value min = rewriter.create<arith::MinUIOp>(loc, gpuLaunchOp.getBlockIds().x, remInBlock);
+    Value cmp_rem_blkId =
+        rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt,
+                                       remInBlock, gpuLaunchOp.getBlockIds().x);
+    Value cmp_rem_blkId_index = rewriter.create<arith::IndexCastUIOp>(
+        loc, rewriter.getIndexType(), cmp_rem_blkId);
+    stepRangeInBlock = rewriter.create<arith::AddIOp>(loc, cmp_rem_blkId_index,
+                                                      stepRangeInBlock);
+    Value min = rewriter.create<arith::MinUIOp>(
+        loc, gpuLaunchOp.getBlockIds().x, remInBlock);
     start = rewriter.create<arith::AddIOp>(loc, start, min);
-    //Calculate the step size range required in a thread
-    Value stepRangeInThread = rewriter.create<arith::DivSIOp>(loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX());
-    Value remInThread = rewriter.create<arith::RemSIOp>(loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX());
-    Value cmp_rem_threadId = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, remInThread, gpuLaunchOp.getThreadIds().x);
-    Value cmp_rem_threadId_index = rewriter.create<arith::IndexCastUIOp>(loc, rewriter.getIndexType(), cmp_rem_threadId);
-    stepRangeInThread = rewriter.create<arith::AddIOp>(loc, cmp_rem_threadId_index, stepRangeInThread);
-
-    Value end = rewriter.create<arith::AddIOp>(loc, start, stepRangeInThread);
-
-    auto sub_forOp = rewriter.create<scf::ForOp>(loc, idx0, stepRangeInThread, idx1, forOp.getInitArgs(), 
-                                                  [&](OpBuilder& builder, Location loc, 
-                                                  Value iv, ValueRange iterArgs)
-    {
-      Block &bodyBlock = forOp.getRegion().front();//original forOp's bodyBlock
-      IRMapping mp;
-      iv = builder.create<arith::MulIOp>(loc, iv, gpuLaunchOp.getBlockSizeX());
-      iv = builder.create<arith::AddIOp>(loc, iv, gpuLaunchOp.getThreadIds().x);
-      iv = builder.create<arith::MulIOp>(loc, iv, step);
-      iv = builder.create<arith::AddIOp>(loc, iv, start);
-      mp.map(bodyBlock.getArgument(0), iv);
-      for(auto&& [a, b] : llvm::zip(bodyBlock.getArguments().drop_front(), iterArgs)){
-        mp.map(a, b);
-      }
-      for(auto&& op_ : bodyBlock.getOperations()){
-        builder.insert(op_.clone(mp));
-      }
-    });
+    // Calculate the number of loop steps assigned to each thread
+    Value stepRangeInThread = rewriter.create<arith::DivSIOp>(
+        loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX());
+    Value remInThread = rewriter.create<arith::RemSIOp>(
+        loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX());
+    Value cmp_rem_threadId = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::sgt, remInThread,
+        gpuLaunchOp.getThreadIds().x);
+    Value cmp_rem_threadId_index = rewriter.create<arith::IndexCastUIOp>(
+        loc, rewriter.getIndexType(), cmp_rem_threadId);
+    stepRangeInThread = rewriter.create<arith::AddIOp>(
+        loc, cmp_rem_threadId_index, stepRangeInThread);
+
+    rewriter.create<arith::AddIOp>(loc, start, stepRangeInThread);
+
+    rewriter.create<scf::ForOp>(
+        loc, idx0, stepRangeInThread, idx1, forOp.getInitArgs(),
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) {
+          Block &bodyBlock =
+              forOp.getRegion().front(); // original forOp's bodyBlock
+          IRMapping mp;
+          iv = builder.create<arith::MulIOp>(loc, iv,
+                                             gpuLaunchOp.getBlockSizeX());
+          iv = builder.create<arith::AddIOp>(loc, iv,
+                                             gpuLaunchOp.getThreadIds().x);
+          iv = builder.create<arith::MulIOp>(loc, iv, step);
+          iv = builder.create<arith::AddIOp>(loc, iv, start);
+          mp.map(bodyBlock.getArgument(0), iv);
+          for (auto &&[a, b] :
+               llvm::zip(bodyBlock.getArguments().drop_front(), iterArgs)) {
+            mp.map(a, b);
+          }
+          for (auto &&op_ : bodyBlock.getOperations()) {
+            builder.insert(op_.clone(mp));
+          }
+        });
   }
 
   LogicalResult
-  matchAndRewrite(Operation* op, ArrayRef<mlir::Value> operands, 
+  matchAndRewrite(Operation *op, ArrayRef<mlir::Value> operands,
                   mlir::ConversionPatternRewriter &rewriter) const final {
     auto loc = op->getLoc();
     auto onDeviceOp = dyn_cast<sche::OnDeviceOp>(op);
 
     rewriter.setInsertionPoint(op);
 
-    auto grid_x = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(3)).getResult();
-    auto grid_y = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1)).getResult();
-    auto grid_z = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1)).getResult();
-    auto block_x = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(3)).getResult();
-    auto block_y = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1)).getResult();
-    auto block_z = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1)).getResult();
+    auto grid_x =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(3))
+            .getResult();
+    auto grid_y =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1))
+            .getResult();
+    auto grid_z =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1))
+            .getResult();
+    auto block_x =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(3))
+            .getResult();
+    auto block_y =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1))
+            .getResult();
+    auto block_z =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1))
+            .getResult();
 
     OpBuilder::InsertPoint insertBeforeOp, insertAfterOp;
     gpu::LaunchOp gpu_launch_op;
 
-    //if use async
+    // If the op has an async token, wrap the launch in an async::ExecuteOp.
     Value token = onDeviceOp.getAsyncToken();
-    if(token){
-      auto asyncDependencies = operands.take_front(onDeviceOp.getODSOperandIndexAndLength(0).second - onDeviceOp.getODSOperandIndexAndLength(0).first);
-      auto async_exec_op = rewriter.create<async::ExecuteOp>(loc, TypeRange{}, asyncDependencies, ValueRange{});
+    if (token) {
+      auto asyncDependencies =
+          operands.take_front(onDeviceOp.getODSOperandIndexAndLength(0).second -
+                              onDeviceOp.getODSOperandIndexAndLength(0).first);
+      auto async_exec_op = rewriter.create<async::ExecuteOp>(
+          loc, TypeRange{}, asyncDependencies, ValueRange{});
       rewriter.replaceAllUsesWith(token, async_exec_op.getToken());
-      auto& bodyBlock = async_exec_op.getBodyRegion().front();
+      auto &bodyBlock = async_exec_op.getBodyRegion().front();
       rewriter.setInsertionPointToStart(&bodyBlock);
-      gpu_launch_op = rewriter.create<gpu::LaunchOp>(loc, grid_x, grid_y, grid_z, block_x, block_y, block_z);
+      gpu_launch_op = rewriter.create<gpu::LaunchOp>(
+          loc, grid_x, grid_y, grid_z, block_x, block_y, block_z);
       rewriter.setInsertionPoint(async_exec_op);
       insertBeforeOp = rewriter.saveInsertionPoint();
       rewriter.setInsertionPointAfter(async_exec_op);
       insertAfterOp = rewriter.saveInsertionPoint();
-    }else{
-      gpu_launch_op = rewriter.create<gpu::LaunchOp>(loc, grid_x, grid_y, grid_z, block_x, block_y, block_z);
+    } else {
+      gpu_launch_op = rewriter.create<gpu::LaunchOp>(
+          loc, grid_x, grid_y, grid_z, block_x, block_y, block_z);
       rewriter.setInsertionPoint(gpu_launch_op);
       insertBeforeOp = rewriter.saveInsertionPoint();
       rewriter.setInsertionPointAfter(gpu_launch_op);
       insertAfterOp = rewriter.saveInsertionPoint();
     }
 
-    auto& bodyBlock = gpu_launch_op.getBody().front();
+    auto &bodyBlock = gpu_launch_op.getBody().front();
 
     rewriter.setInsertionPointToStart(&bodyBlock);
     auto insertToStart = rewriter.saveInsertionPoint();
@@ -293,59 +360,73 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte
     auto insertToEnd = rewriter.saveInsertionPoint();
 
     IRMapping mp;
-    auto innerOperands = operands.take_back(onDeviceOp.getODSOperandIndexAndLength(1).second - onDeviceOp.getODSOperandIndexAndLength(1).first);
-    auto insertPointInGpuLaunchBody = convertOperands(rewriter, innerOperands, mp, loc, insertBeforeOp, insertToStart);
+    auto innerOperands =
+        operands.take_back(onDeviceOp.getODSOperandIndexAndLength(1).second -
+                           onDeviceOp.getODSOperandIndexAndLength(1).first);
+    auto insertPointInGpuLaunchBody = convertOperands(
+        rewriter, innerOperands, mp, loc, insertBeforeOp, insertToStart);
     auto results = onDeviceOp.getInnerResults();
-    auto result_memrefs = convertResults(rewriter, results, mp, loc, insertBeforeOp, insertAfterOp);
-  
+    auto result_memrefs = convertResults(rewriter, results, mp, loc,
+                                         insertBeforeOp, insertAfterOp);
+
     assert(isa<StringAttr>(op->getAttr("sche.source")));
-    auto sche_source = dyn_cast_or_null<StringAttr>(op->getAttr("sche.source")).strref();
+    auto sche_source =
+        dyn_cast_or_null<StringAttr>(op->getAttr("sche.source")).strref();
 
-    //scf::for lower
-    if(sche_source == "scf.for"){
-      Operation& op_ = onDeviceOp.getRegion().front().front();
+    // scf::for lower
+    if (sche_source == "scf.for") {
+      Operation &op_ = onDeviceOp.getRegion().front().front();
       auto for_op = dyn_cast<scf::ForOp>(op_);
-      lowerFromForOp(for_op, gpu_launch_op, insertBeforeOp, insertPointInGpuLaunchBody, loc, rewriter, grid_x, grid_y, grid_z, block_x, block_y, block_z);
-    }
-    else if(sche_source == "func"){
+      lowerFromForOp(for_op, gpu_launch_op, insertBeforeOp,
+                     insertPointInGpuLaunchBody, loc, rewriter, grid_x, grid_y,
+                     grid_z, block_x, block_y, block_z);
+    } else if (sche_source == "func") {
       rewriter.restoreInsertionPoint(insertPointInGpuLaunchBody);
-      for(auto&& op_ : onDeviceOp.getRegion().front().getOperations()){
-        if(!op_.hasTrait<OpTrait::ReturnLike>()){
+      for (auto &&op_ : onDeviceOp.getRegion().front().getOperations()) {
+        if (!op_.hasTrait<OpTrait::ReturnLike>()) {
           auto new_op = rewriter.clone(op_, mp);
-          for(auto&& [a, b] : llvm::zip(op_.getResults(), new_op->getResults())){
+          for (auto &&[a, b] :
+               llvm::zip(op_.getResults(), new_op->getResults())) {
             mp.map(a, b);
           }
-        }else{
-          int i=0;
+        } else {
+          int i = 0;
           rewriter.restoreInsertionPoint(insertToEnd);
-          for(auto res : op_.getOperands()){
+          for (auto res : op_.getOperands()) {
             auto t = res.getType();
-            //TODO:必须要有rank
-            if(isa<TensorType>(t)){
+            // TODO: must have the rank
+            if (isa<TensorType>(t)) {
               auto shape = dyn_cast<TensorType>(t).getShape();
               auto ele_type = dyn_cast<TensorType>(t).getElementType();
-              auto to_memref_op = rewriter.create<bufferization::ToMemrefOp>(loc, MemRefType::get(shape, ele_type), mp.lookupOrNull<Value>(res));
-              auto copy_op = rewriter.create<memref::CopyOp>(loc, to_memref_op.getResult(), result_memrefs[i++]);
-            }
-            else if(isa<VectorType>(t)){
-              auto idx0 = rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0)).getResult();
-              llvm::SmallVector<Value> indices(dyn_cast<VectorType>(t).getShape().size(), idx0);
-              auto vector_transfer_write_op = rewriter.create<vector::TransferWriteOp>(loc, mp.lookupOrNull<Value>(res), result_memrefs[i++], indices);
-            }
-            else if(isa<MemRefType>(t)){
-              auto copy_op = rewriter.create<memref::CopyOp>(loc, mp.lookupOrNull<Value>(res), result_memrefs[i++]);
-            }
-            else{
-              auto store_op = rewriter.create<memref::StoreOp>(loc, mp.lookupOrNull<Value>(res), result_memrefs[i++]);
+              auto to_memref_op = rewriter.create<bufferization::ToMemrefOp>(
+                  loc, MemRefType::get(shape, ele_type),
+                  mp.lookupOrNull<Value>(res));
+              rewriter.create<memref::CopyOp>(loc, to_memref_op.getResult(),
+                                              result_memrefs[i++]);
+            } else if (isa<VectorType>(t)) {
+              auto idx0 =
+                  rewriter
+                      .create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0))
+                      .getResult();
+              llvm::SmallVector<Value> indices(
+                  dyn_cast<VectorType>(t).getShape().size(), idx0);
+              rewriter.create<vector::TransferWriteOp>(
+                  loc, mp.lookupOrNull<Value>(res), result_memrefs[i++],
+                  indices);
+            } else if (isa<MemRefType>(t)) {
+              rewriter.create<memref::CopyOp>(loc, mp.lookupOrNull<Value>(res),
+                                              result_memrefs[i++]);
+            } else {
+              rewriter.create<memref::StoreOp>(loc, mp.lookupOrNull<Value>(res),
+                                               result_memrefs[i++]);
             }
           }
         }
       }
-    }
-    else{
-      //TODO add conversion of onDeviceOp from more op
-      printf("conversion from source %s has not implemented\n", sche_source);
-      abort();
+    } else {
+      // TODO add conversion of onDeviceOp from more op
+      op->emitError("Conversion from source " + sche_source +
+                    " has not been implemented");
     }
 
     rewriter.setInsertionPointToEnd(&bodyBlock);
@@ -359,7 +440,8 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte
 
 } // end anonymous namespace
 
-void populateLowerScheConversionPatterns(TypeConverter& typeConverter, RewritePatternSet &patterns) {
+void populateLowerScheConversionPatterns(TypeConverter &typeConverter,
+                                         RewritePatternSet &patterns) {
   // clang-format off
   patterns.add<OnDeviceOpScheLowering>(typeConverter, patterns.getContext());
   patterns.add<WaitOpScheLowering>(typeConverter, patterns.getContext());
@@ -371,7 +453,8 @@ void populateLowerScheConversionPatterns(TypeConverter& typeConverter, RewritePa
 //===----------------------------------------------------------------------===//
 
 namespace {
-class LowerSchePass : public PassWrapper<LowerSchePass, OperationPass<ModuleOp>> {
+class LowerSchePass
+    : public PassWrapper<LowerSchePass, OperationPass<ModuleOp>> {
 public:
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerSchePass)
   LowerSchePass() = default;
@@ -422,8 +505,10 @@ void LowerSchePass::runOnOperation() {
 
   target.addIllegalDialect<buddy::sche::ScheDialect>();
   TypeConverter typeConverter;
-  typeConverter.addConversion([&](sche::AsyncTokenType type){return async::TokenType::get(context);});
-  typeConverter.addConversion([&](Type type){return type;});
+  typeConverter.addConversion([&](sche::AsyncTokenType type) {
+    return async::TokenType::get(context);
+  });
+  typeConverter.addConversion([&](Type type) { return type; });
 
   RewritePatternSet patterns(context);
   populateLowerScheConversionPatterns(typeConverter, patterns);
diff --git a/midend/lib/Conversion/MLIRGPU/CMakeLists.txt b/midend/lib/Conversion/MLIRGPU/CMakeLists.txt
new file mode 100644
index 000000000..041c6ff11
--- /dev/null
+++ b/midend/lib/Conversion/MLIRGPU/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Library that bundles the GPU conversion passes built from this directory.
+# NOTE(review): ConvertMemcpyToGPU.cpp also lives in this directory but is not
+# listed in the sources below -- confirm whether that is intentional.
+add_mlir_library(MLIRGPUPasses
+  GPUHostRegister.cpp
+  GPUBufferize.cpp
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Bufferization
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRBufferizationDialect
+  MLIRControlFlowInterfaces
+  MLIRFuncDialect
+  MLIRFunctionInterfaces
+  MLIRInferTypeOpInterface
+  MLIRIR
+  MLIRMemRefDialect
+  MLIRPass
+  MLIRTensorDialect
+  MLIRSCFDialect
+  MLIRSideEffectInterfaces
+  MLIRSubsetOpInterface
+  MLIRTransforms
+  MLIRViewLikeInterface
+  MLIRSupport
+  BuddyUtils
+  MLIRBufferizationTransforms
+  MLIRGPUDialect
+)
diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp
new file mode 100644
index 000000000..1417a96bb
--- /dev/null
+++ b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp
@@ -0,0 +1,256 @@
+//===- ConvertMemcpyToGPU.cpp
+//-------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass that converts memcpy to gpu operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <mlir/Dialect/Affine/IR/AffineOps.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/Linalg/Transforms/Transforms.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/TypeUtilities.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Pass/Pass.h>
+
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+using namespace mlir;
+using namespace vector;
+
+//===----------------------------------------------------------------------===//
+// Rewrite Pattern
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Placeholder pattern anchored on `gpu.launch_func`; no rewrite exists yet.
+class ConvertMemcpyToGPUPattern : public ConversionPattern {
+public:
+  explicit ConvertMemcpyToGPUPattern(MLIRContext *context)
+      : ConversionPattern(gpu::LaunchFuncOp::getOperationName(), 1, context) {}
+
+  /// Reports a match failure: returning success() without changing the IR
+  /// would break the contract between a pattern and its driver.
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> /*operands*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    return rewriter.notifyMatchFailure(op, "rewrite is not implemented");
+  }
+};
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// ConvertMemcpyToGPUPass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Function pass that rewrites memref allocations and copies to the gpu dialect.
+class ConvertMemcpyToGPUPass
+    : public PassWrapper<ConvertMemcpyToGPUPass, OperationPass<func::FuncOp>> {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass)
+  ConvertMemcpyToGPUPass() = default;
+  ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass &) {}
+
+  StringRef getArgument() const final { return "convert-memcpy-to-gpu"; }
+  StringRef getDescription() const final {
+    return "Convert memref operations to gpu operations.";
+  }
+
+  void runOnOperation() override;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<gpu::GPUDialect, memref::MemRefDialect>();
+  }
+};
+
+/// Rewrites one function so that its memref data lives in GPU memory:
+///  - each memref argument and each memref.get_global result is copied into
+///    a new gpu.alloc buffer that replaces the host value in its users;
+///  - memref.alloc, memref.dealloc and memref.copy are replaced by gpu.alloc,
+///    gpu.dealloc and gpu.memcpy;
+///  - in front of each func.return, returned memrefs are copied into new host
+///    buffers and the GPU buffers are released.
+/// Non-memref arguments and results are left untouched. Replaced host ops are
+/// erased after the traversal has finished.
+void ConvertMemcpyToGPUPass::runOnOperation() {
+  auto funcOp = getOperation();
+  // Declarations have no body to rewrite.
+  if (funcOp.isExternal())
+    return;
+
+  // GPU buffers created here that have no matching dealloc in the input IR
+  // (copies of function arguments and of globals); they are released in front
+  // of every return. Values are stored instead of pointers to the local op
+  // handles, because such pointers dangle once the creating scope ends.
+  std::vector<Value> unDeallocatedBuffers;
+  // Ops superseded by a gpu counterpart. They are erased only after the walk
+  // so that the traversal never touches an operation that is already gone.
+  std::vector<Operation *> opsToErase;
+
+  // Redirects every use of `from` to `to`, except uses by gpu.memcpy ops, so
+  // that the host-to-device copy created for `from` keeps its host source.
+  auto redirectUses = [](Value from, Value to) {
+    auto users = from.getUsers();
+    std::vector<Operation *> usersVec(users.begin(), users.end());
+    for (Operation *user : usersVec)
+      if (!isa<gpu::MemcpyOp>(user))
+        user->replaceUsesOfWith(from, to);
+  };
+
+  // Copy all memref function arguments to gpu, needs deallocation
+  OpBuilder builder(funcOp->getContext());
+  builder.setInsertionPointToStart(&(funcOp.getBody().front()));
+  for (BlockArgument arg : funcOp.getArguments()) {
+    // Only memref arguments have a buffer to mirror; skip everything else.
+    auto memrefType = dyn_cast<MemRefType>(arg.getType());
+    if (!memrefType)
+      continue;
+    // Create a gpu.alloc op, then copy memory to it
+    // TODO: Move this out of operation, make the copy process async
+    auto gpuAllocOp = builder.create<gpu::AllocOp>(
+        builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({}));
+    Value gpuBuffer = gpuAllocOp.getResult(0);
+    unDeallocatedBuffers.push_back(gpuBuffer);
+    builder.create<gpu::MemcpyOp>(gpuAllocOp.getLoc(), TypeRange(),
+                                  ValueRange(), gpuBuffer, arg);
+    // Replace all users with GPU memory
+    redirectUses(arg, gpuBuffer);
+  }
+
+  funcOp->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
+    // Replace all allocations with GPU.alloc
+    if (auto allocOp = dyn_cast<memref::AllocOp>(nestedOp)) {
+      // Rewrite this allocOp to gpu.alloc, change for all users
+      builder.setInsertionPointAfter(allocOp);
+      Value result = allocOp->getResult(0);
+      auto memrefType = dyn_cast<MemRefType>(result.getType());
+      auto gpuAllocOp = builder.create<gpu::AllocOp>(
+          allocOp->getLoc(), TypeRange({memrefType}), ValueRange({}));
+      Value gpuBuffer = gpuAllocOp.getResult(0);
+      auto users = result.getUsers();
+      std::vector<Operation *> usersVec(users.begin(), users.end());
+      for (Operation *user : usersVec) {
+        // A memref.dealloc of the old buffer becomes a gpu.dealloc of the new
+        // one. Only the return value will not have dealloc op.
+        if (auto deallocOp = dyn_cast<memref::DeallocOp>(user)) {
+          builder.setInsertionPointAfter(deallocOp);
+          builder.create<gpu::DeallocOp>(deallocOp->getLoc(), TypeRange(),
+                                         ValueRange(), gpuBuffer);
+          // Queued, not erased: `user` must stay valid and the walk may not
+          // have reached this op yet.
+          opsToErase.push_back(deallocOp);
+          continue;
+        }
+        user->replaceUsesOfWith(result, gpuBuffer);
+      }
+      opsToErase.push_back(allocOp);
+    }
+    // Replace all memory.copy operations with gpu.memcpy
+    else if (auto copyOp = dyn_cast<memref::CopyOp>(nestedOp)) {
+      Value src = copyOp.getOperand(0);
+      Value dst = copyOp.getOperand(1);
+      // Notice: GPU.memcpy has a different src dst order
+      builder.setInsertionPointAfter(copyOp);
+      builder.create<gpu::MemcpyOp>(copyOp->getLoc(), TypeRange(),
+                                    ValueRange(), dst, src);
+      // gpu.memcpy takes the very same values, so no user of `src` or `dst`
+      // needs rewriting. In particular the source operand of the new op must
+      // not be redirected to `dst`, which would make it copy dst onto itself.
+      opsToErase.push_back(copyOp);
+    }
+    // Allocate space on GPU and copy global memrefs to GPU, needs deallocation
+    else if (auto getGlobalOp = dyn_cast<memref::GetGlobalOp>(nestedOp)) {
+      builder.setInsertionPointAfter(getGlobalOp);
+      Value src = getGlobalOp->getResult(0);
+      auto memrefType = dyn_cast<MemRefType>(src.getType());
+      auto gpuAllocOp = builder.create<gpu::AllocOp>(
+          getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({}));
+      Value dst = gpuAllocOp->getResult(0);
+      unDeallocatedBuffers.push_back(dst);
+      builder.create<gpu::MemcpyOp>(gpuAllocOp->getLoc(), TypeRange(),
+                                    ValueRange(), dst, src);
+      redirectUses(src, dst);
+    }
+    // Copy data back to CPU, deallocate GPU, then return
+    else if (auto returnOp = dyn_cast<func::ReturnOp>(nestedOp)) {
+      builder.setInsertionPoint(returnOp);
+      // Copy every returned memref into a fresh host buffer first, so that no
+      // GPU buffer is released while its contents are still needed.
+      std::vector<Value> released;
+      for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) {
+        Value val = returnOp->getOperand(i);
+        auto memRefType = dyn_cast<MemRefType>(val.getType());
+        if (!memRefType)
+          continue;
+        // NOTE(review): no dynamic sizes are passed to the allocation, so
+        // this presumably only handles statically shaped results -- confirm.
+        auto allocOp = builder.create<memref::AllocOp>(
+            builder.getUnknownLoc(), memRefType);
+        builder.create<gpu::MemcpyOp>(allocOp.getLoc(), TypeRange(),
+                                      ValueRange(), allocOp->getResult(0), val);
+        if (!llvm::is_contained(released, val))
+          released.push_back(val);
+        returnOp->setOperand(i, allocOp->getResult(0));
+      }
+      // Release every returned GPU buffer once, followed by the tracked
+      // buffers that were not returned.
+      for (Value buffer : unDeallocatedBuffers)
+        if (!llvm::is_contained(released, buffer))
+          released.push_back(buffer);
+      for (Value buffer : released)
+        builder.create<gpu::DeallocOp>(builder.getUnknownLoc(), TypeRange(),
+                                       ValueRange(), buffer);
+    }
+    return WalkResult::advance();
+  });
+
+  // Every use was redirected above, so the superseded ops can go now. A
+  // dealloc is queued before the alloc it uses, hence erased before it.
+  for (Operation *op : opsToErase)
+    op->erase();
+}
+} // end anonymous namespace.
+
+namespace mlir::buddy {
+/// Registers ConvertMemcpyToGPUPass so that it can be requested by its
+/// argument name.
+void registerConvertMemcpyToGPUPass() {
+  PassRegistration<ConvertMemcpyToGPUPass>();
+}
+} // namespace mlir::buddy
diff --git a/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp b/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp
new file mode 100644
index 000000000..94a24e52e
--- /dev/null
+++ b/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp
@@ -0,0 +1,220 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===- GPUBufferizePass.cpp - ---------------------------------------------===//
+//
+// Wrapper pass to use MLIR's One-Shot Bufferize pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h"
+#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/Transforms.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/Casting.h"
+#include <mlir/Dialect/Affine/IR/AffineOps.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/Linalg/Transforms/Transforms.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/TypeUtilities.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Pass/Pass.h>
+
+using mlir::bufferization::BufferizationOptions;
+using mlir::bufferization::OneShotAnalysisState;
+using mlir::bufferization::OneShotBufferizationOptions;
+
+using namespace mlir;
+
+namespace {
+
+/// True when the memref carries the gpu workgroup address-space attribute.
+bool hasSharedMemoryAddressSpace(MemRefType memrefType) {
+  Attribute space = memrefType.getMemorySpace();
+  auto attr = llvm::dyn_cast_if_present<gpu::AddressSpaceAttr>(space);
+  return attr && attr.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace();
+}
+
+/// Allocation callback: creates a memref.alloc with the requested shape and
+/// element type in the gpu workgroup address space.
+/// NOTE(review): `alignment` is not forwarded to the allocation -- confirm.
+static FailureOr<Value> gpuAllocationFn(OpBuilder &builder, Location loc,
+                                        MemRefType memRefType,
+                                        ValueRange dynamicSizes,
+                                        unsigned alignment) {
+  auto space = gpu::GPUDialect::getWorkgroupAddressSpace();
+  auto attr = gpu::AddressSpaceAttr::get(builder.getContext(), space);
+  auto type = MemRefType::get(memRefType.getShape(),
+                              memRefType.getElementType(), AffineMap(), attr);
+  return builder.create<memref::AllocOp>(loc, type, dynamicSizes).getResult();
+}
+
+/// Copy callback: emits a memref.copy and, when either side is in workgroup
+/// memory, places a gpu.barrier before and after it.
+static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from,
+                               Value to) {
+  auto inWorkgroup = [](Value v) {
+    return hasSharedMemoryAddressSpace(llvm::cast<MemRefType>(v.getType()));
+  };
+  // Both operands are inspected unconditionally; each must be a memref.
+  const bool fromShared = inWorkgroup(from);
+  const bool toShared = inWorkgroup(to);
+  const bool needsBarrier = fromShared || toShared;
+
+  if (needsBarrier)
+    builder.create<gpu::BarrierOp>(loc);
+  builder.create<memref::CopyOp>(loc, from, to);
+  if (needsBarrier)
+    builder.create<gpu::BarrierOp>(loc);
+  return success();
+}
+
+/// Module pass that converts tensor based ops to memref based ops, using
+/// configurable allocation and copy callbacks.
+class BuudyGPUBufferizePass
+    : public PassWrapper<BuudyGPUBufferizePass, OperationPass<ModuleOp>> {
+  using AllocationFn = BufferizationOptions::AllocationFn;
+  using MemCpyFn = BufferizationOptions::MemCpyFn;
+
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(BuudyGPUBufferizePass)
+
+  /// Both callbacks default to the GPU variants defined in this file.
+  explicit BuudyGPUBufferizePass(
+      std::optional<AllocationFn> allocationFn = gpuAllocationFn,
+      std::optional<MemCpyFn> memCpyFn = gpuCopyFn)
+      : allocationFn(allocationFn), memCpyFn(memCpyFn) {}
+
+  /// Argument name under which the pass is registered.
+  StringRef getArgument() const final { return "gpu-bufferize"; }
+  StringRef getDescription() const final {
+    return "One shot bufferize GPU pass.";
+  }
+
+  /// Declares the dialects this pass depends on.
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, arith::ArithDialect,
+                    bufferization::BufferizationDialect, func::FuncDialect,
+                    linalg::LinalgDialect, memref::MemRefDialect,
+                    scf::SCFDialect, tensor::TensorDialect,
+                    vector::VectorDialect, gpu::GPUDialect>();
+  }
+
+  void runOnOperation() override;
+
+private:
+  /// Handed to the bufferization options as the allocation callback.
+  const std::optional<AllocationFn> allocationFn;
+  /// Handed to the bufferization options as the copy callback.
+  const std::optional<MemCpyFn> memCpyFn;
+};
+
+
+
+} // namespace
+
+// Adapted from bufferization::runOneShotBufferize: runs the One-Shot analysis
+// on `op`, stops early when only the analysis is being tested, and otherwise
+// hands over to bufferization::runOneShotBufferize.
+LogicalResult
+runBuudyOneShotBufferize(Operation *op,
+                        const OneShotBufferizationOptions &options) {
+  OneShotAnalysisState analysisState(op, options);
+  const bool analysisOk = succeeded(analyzeOp(op, analysisState));
+  if (!analysisOk || options.testAnalysisOnly)
+    return success(analysisOk);
+  return bufferization::runOneShotBufferize(op, options);
+}
+
+/// Bufferizes the module with One-Shot Bufferize using the configured
+/// allocation and copy callbacks, then applies the linalg patterns that erase
+/// unused operands and results. Any failure is signalled as a pass failure.
+void BuudyGPUBufferizePass::runOnOperation() {
+  ModuleOp root = getOperation();
+
+  OneShotBufferizationOptions bufferizeOptions;
+  bufferizeOptions.allocationFn = allocationFn;
+  bufferizeOptions.memCpyFn = memCpyFn;
+  if (failed(runBuudyOneShotBufferize(root, bufferizeOptions))) {
+    signalPassFailure();
+    return;
+  }
+
+  // Remove redundant args and unused results.
+  RewritePatternSet cleanupPatterns(&getContext());
+  linalg::populateEraseUnusedOperandsAndResultsPatterns(cleanupPatterns);
+  if (failed(applyPatternsAndFoldGreedily(root, std::move(cleanupPatterns))))
+    signalPassFailure();
+}
+
+/// Creates the GPU bufferization pass; omitted callbacks use the GPU defaults.
+std::unique_ptr<OperationPass<ModuleOp>> createBuudyGPUBufferizePass(
+    std::optional<BufferizationOptions::AllocationFn> allocationFn,
+    std::optional<BufferizationOptions::MemCpyFn> memCpyFn) {
+  if (!allocationFn.has_value())
+    allocationFn = gpuAllocationFn;
+  if (!memCpyFn.has_value())
+    memCpyFn = gpuCopyFn;
+  return std::make_unique<BuudyGPUBufferizePass>(allocationFn, memCpyFn);
+}
+
+/// Appends the cleanup passes that are run after bufferization.
+void addBuudyPostBufferizationPasses(OpPassManager &passManager) {
+  passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
+  passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  passManager.addNestedPass<func::FuncOp>(createCSEPass());
+  // Canonicalize once more: CSE unifies the memrefs, and only then can the
+  // redundant memcpy ops (in linalg.generic form) truly be removed.
+  passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+}
+
+/// Appends the GPU bufferization pipeline, post-bufferization cleanup included.
+void addBuudyGPUBufferizePasses(
+    OpPassManager &passManager,
+    std::optional<BufferizationOptions::AllocationFn> allocationFn,
+    std::optional<BufferizationOptions::MemCpyFn> memCpyFn) {
+  passManager.addPass(bufferization::createEmptyTensorEliminationPass());
+  passManager.addPass(bufferization::createEmptyTensorToAllocTensorPass());
+  passManager.addPass(createBuudyGPUBufferizePass(allocationFn, memCpyFn));
+  addBuudyPostBufferizationPasses(passManager);
+}
+
+namespace mlir::buddy {
+void registerBuddyGPUBufferizePass() {
+  PassRegistration<BuudyGPUBufferizePass>();
+}
+} // namespace mlir::buddy
\ No newline at end of file
diff --git a/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp b/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp
new file mode 100644
index 000000000..e9d8a046f
--- /dev/null
+++ b/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp
@@ -0,0 +1,323 @@
+//===- GPUHostRegister.cpp
+//-------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GPU host register pass that adds gpu.host_register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <mlir/Dialect/Affine/IR/AffineOps.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/Linalg/Transforms/Transforms.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/TypeUtilities.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Pass/Pass.h>
+
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+using namespace mlir;
+using namespace vector;
+
+//===----------------------------------------------------------------------===//
+// Rewrite Pattern
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Placeholder pattern anchored on `gpu.launch_func`; no rewrite exists yet.
+class GPUHostRegisterPattern : public ConversionPattern {
+public:
+  explicit GPUHostRegisterPattern(MLIRContext *context)
+      : ConversionPattern(gpu::LaunchFuncOp::getOperationName(), 1, context) {}
+
+  /// Reports a match failure: returning success() without changing the IR
+  /// would break the contract between a pattern and its driver.
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> /*operands*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    return rewriter.notifyMatchFailure(op, "rewrite is not implemented");
+  }
+};
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// GPUHostRegisterPass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Module pass registered under the `gpu-host-register` argument.
+class GPUHostRegisterPass
+    : public PassWrapper<GPUHostRegisterPass, OperationPass<ModuleOp>> {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUHostRegisterPass)
+  GPUHostRegisterPass() = default;
+  GPUHostRegisterPass(const GPUHostRegisterPass &) {}
+
+  StringRef getArgument() const final { return "gpu-host-register"; }
+  StringRef getDescription() const final {
+    return "Register host memory to legalize gpu access.";
+  }
+
+  void runOnOperation() override;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<gpu::GPUDialect, memref::MemRefDialect>();
+  }
+};
+} // end anonymous namespace.
+
+/// Returns the only memref operand of `op`. The pointer refers to thread-local
+/// storage (not to a dead loop variable) and stays valid until the next call.
+Value *getSourceOperand(Operation *op) {
+  static thread_local Value memrefOperand;
+  memrefOperand = Value();
+  for (Value operand : op->getOperands()) {
+    if (!operand.getType().isa<BaseMemRefType>())
+      continue;
+    if (memrefOperand)
+      llvm_unreachable("Op has more than one memref operand");
+    memrefOperand = operand;
+  }
+  if (!memrefOperand)
+    llvm_unreachable("Op has no memref operand");
+  return &memrefOperand;
+}
+
+/// Follows `value` backwards through view-like memref ops until it reaches the
+/// value that stands for the underlying storage.
+///
+/// Returns that root value together with a kind tag:
+///   0 - the root is the result of a memref.alloc,
+///   1 - the root is the result of a memref.get_global,
+///   3 - the root has no recognised producer (for example a block argument).
+/// Any producer outside the list below aborts through llvm_unreachable.
+///
+/// The walk works on `Value` handles by value, so nothing it returns refers to
+/// a local variable of this function.
+static std::pair<Value, int> traceMemrefRoot(Value value) {
+  while (Operation *producerOp = value.getDefiningOp()) {
+    if (isa<memref::AllocOp>(producerOp))
+      return {value, 0};
+    // memref.get_global: the caller creates a copy of the global.
+    if (isa<memref::GetGlobalOp>(producerOp))
+      return {value, 1};
+    if (!isa<memref::SubViewOp, memref::LoadOp, memref::CollapseShapeOp,
+             memref::ExpandShapeOp, memref::CastOp, memref::ReinterpretCastOp,
+             memref::ReshapeOp, memref::TransposeOp, memref::ViewOp>(
+            producerOp))
+      llvm_unreachable("Unknown producer op");
+
+    // Continue from the first memref-typed operand of the view-like op.
+    Value source;
+    for (Value operand : producerOp->getOperands()) {
+      if (isa<BaseMemRefType>(operand.getType())) {
+        source = operand;
+        break;
+      }
+    }
+    // No memref operand to follow: treat the current value as the root.
+    if (!source)
+      break;
+    value = source;
+  }
+  // Values that come from outside, e.g. function arguments.
+  return {value, 3};
+}
+
+/// Pointer-based entry point kept for existing callers.
+///
+/// For kinds 0 and 1 `first` is the allocating / get_global operation. For
+/// kind 3 `first` is `value` itself reinterpreted as `Operation *`, exactly as
+/// before; in addition `*value` is overwritten with the root value that was
+/// found, so the pointer never refers to a temporary created during the walk
+/// (the previous recursive implementation returned the address of a loop-local
+/// copy in that case). When `*value` already is the root nothing changes.
+std::pair<Operation *, int> getAllocationOp(Value *value) {
+  std::pair<Value, int> traced = traceMemrefRoot(*value);
+  if (traced.second != 3)
+    return {traced.first.getDefiningOp(), traced.second};
+  *value = traced.first;
+  return {reinterpret_cast<Operation *>(value), 3};
+}
+
+/// Returns true when both pointers name the same operation or when
+/// OperationEquivalence considers the two operations equivalent.
+static bool isEqual(const Operation *lhsC, const Operation *rhsC) {
+  auto *lhs = const_cast<Operation *>(lhsC);
+  auto *rhs = const_cast<Operation *>(rhsC);
+  if (lhs == rhs)
+    return true;
+
+  return OperationEquivalence::isEquivalentTo(lhs, rhs,
+                                              OperationEquivalence::None);
+}
+
+/// For every memref operand of every gpu.launch_func in the module, makes sure
+/// the backing buffer is passed to gpu.host_register exactly once:
+///   * memref.alloc roots are registered right after the allocation,
+///   * memref.get_global roots are copied into a fresh allocation that is
+///     registered and substituted into the launch,
+///   * roots without a recognised producer are registered right after their
+///     definition (start of the owning block for block arguments).
+/// memref.dealloc ops on a copied global are redirected to the copy.
+void GPUHostRegisterPass::runOnOperation() {
+  auto module = getOperation();
+  // Allocation ops whose result already has a gpu.host_register.
+  std::set<Operation *> allocations;
+  // memref.get_global op -> registered copy. The copy is stored as a Value
+  // handle; storing the address of a local op wrapper would dangle.
+  std::map<Operation *, Value> globalAllocations;
+  // External roots already registered, keyed by SSA value identity rather
+  // than by the address of a temporary Value object.
+  std::set<const void *> outsideValues;
+
+  // Emits (cast to unranked memref +) gpu.host_register for `buffer` at the
+  // builder's insertion point. The element type comes from `buffer` itself so
+  // the cast stays valid when the launch operand is a view with another type.
+  // Unranked buffers are registered directly because memref.cast does not
+  // convert between two unranked types.
+  auto emitHostRegister = [](OpBuilder &builder, Location loc, Value buffer) {
+    Value unranked = buffer;
+    if (auto rankedType = dyn_cast<MemRefType>(buffer.getType())) {
+      UnrankedMemRefType resType =
+          UnrankedMemRefType::get(rankedType.getElementType(), 0);
+      unranked =
+          builder.create<memref::CastOp>(loc, resType, buffer).getResult();
+    }
+    builder.create<gpu::HostRegisterOp>(loc, unranked);
+  };
+
+  module->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
+    if (auto launchFuncOp = dyn_cast<gpu::LaunchFuncOp>(nestedOp)) {
+      for (Value operand : launchFuncOp->getOperands()) {
+        if (!isa<BaseMemRefType>(operand.getType()))
+          continue;
+        std::pair<Value, int> traced = traceMemrefRoot(operand);
+        Value root = traced.first;
+        int kind = traced.second;
+        OpBuilder builder(launchFuncOp->getContext());
+
+        if (kind == 0) {
+          // Register each allocation only once.
+          if (!allocations.insert(root.getDefiningOp()).second)
+            continue;
+          builder.setInsertionPointAfterValue(root);
+          emitHostRegister(builder, root.getLoc(), root);
+        } else if (kind == 1) {
+          // Add a registered copy for this global and use it in the launch.
+          // NOTE(review): when `operand` is a view of the global rather than
+          // the global itself, the buffer is allocated with the view's type
+          // but filled from the whole global - confirm that case is covered.
+          Operation *globalOp = root.getDefiningOp();
+          builder.setInsertionPointAfter(globalOp);
+          auto memrefType = cast<MemRefType>(operand.getType());
+          auto newAllocOp = builder.create<memref::AllocOp>(
+              globalOp->getLoc(), memrefType, ValueRange{});
+          builder.create<memref::CopyOp>(globalOp->getLoc(), root,
+                                         newAllocOp.getResult());
+          for (size_t i = 0; i < launchFuncOp->getNumOperands(); i++) {
+            if (launchFuncOp->getOperand(i) == operand) {
+              launchFuncOp->setOperand(i, newAllocOp.getResult());
+            }
+          }
+          // Later operands that were just rewritten resolve to this alloc;
+          // recording it prevents a second registration.
+          allocations.insert(newAllocOp.getOperation());
+          emitHostRegister(builder, newAllocOp->getLoc(),
+                           newAllocOp.getResult());
+          globalAllocations[globalOp] = newAllocOp.getResult();
+        } else if (kind == 3) {
+          // Register the external memory directly, once per value. Only this
+          // operand is skipped when it is already registered; the remaining
+          // operands of the launch are still processed.
+          if (!outsideValues.insert(root.getAsOpaquePointer()).second) {
+            llvm::dbgs()<<"Global value registered.\n";
+            continue;
+          }
+          // Start of the owning block for block arguments, right after the
+          // defining op otherwise; either way the root dominates the cast.
+          builder.setInsertionPointAfterValue(root);
+          emitHostRegister(builder, root.getLoc(), root);
+        }
+      }
+      return WalkResult::advance();
+    }
+    if (auto deallocOp = dyn_cast<memref::DeallocOp>(nestedOp)) {
+      Value operand = deallocOp->getOperand(0);
+      // getDefiningOp<OpTy>() yields a null op for block arguments and for
+      // other producers instead of asserting on a null Operation pointer.
+      auto getGlobalOp = operand.getDefiningOp<memref::GetGlobalOp>();
+      if (!getGlobalOp)
+        return WalkResult::advance();
+      auto copyIt = globalAllocations.find(getGlobalOp.getOperation());
+      if (copyIt != globalAllocations.end())
+        deallocOp->setOperand(0, copyIt->second);
+    }
+    return WalkResult::advance();
+  });
+}
+
+namespace mlir {
+namespace buddy {
+/// Makes GPUHostRegisterPass available through the pass registry by
+/// constructing a PassRegistration object for it.
+void registerGPUHostRegisterPass() {
+  PassRegistration<GPUHostRegisterPass>();
+}
+} // namespace buddy
+} // namespace mlir
diff --git a/midend/lib/Utils/CMakeLists.txt b/midend/lib/Utils/CMakeLists.txt
index 7d21a6765..ff9aa6e38 100644
--- a/midend/lib/Utils/CMakeLists.txt
+++ b/midend/lib/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_library(BuddyUtils
   Utils.cpp
   DIPUtils.cpp
+  DAPUtils.cpp
   AffineTransformUtils.cpp
   )
 
@@ -9,4 +10,11 @@ add_mlir_library(BuddyDIPUtils
   
   LINK_LIBS PUBLIC
   BuddyUtils
-  )
\ No newline at end of file
+  )
+
+add_mlir_library(BuddyDAPUtils
+  DAPUtils.cpp
+  
+  LINK_LIBS PUBLIC
+  BuddyUtils
+  )
diff --git a/midend/lib/Utils/DAPUtils.cpp b/midend/lib/Utils/DAPUtils.cpp
new file mode 100644
index 000000000..4586f43b4
--- /dev/null
+++ b/midend/lib/Utils/DAPUtils.cpp
@@ -0,0 +1,220 @@
+//====- DAPUtils.cpp ------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements DAP dialect specific utility functions for the buddy
+// compiler ecosystem.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef UTILS_DAPUTILS_DEF
+#define UTILS_DAPUTILS_DEF
+
+#include <cassert>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Math/IR/Math.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
+#include <mlir/Dialect/Vector/IR/VectorOps.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/Value.h>
+#include <vector>
+
+#include "DAP/DAPDialect.h"
+#include "DAP/DAPOps.h"
+#include "Utils/DAPUtils.h"
+#include "Utils/Utils.h"
+
+using namespace mlir;
+
+namespace buddy {
+namespace dap {
+
+// Builds the five coefficient vectors (b0, b1, b2, a1, a2) of an SOS filter.
+//
+// Each vector starts as a splat (b0 from `f1`, the other four from `f0`). An
+// scf.for over [c0, filterSize) with step c1 then loads, for row `iv` of
+// `kernel`, the entries at columns c0, c1, c2, c4 and c5 and inserts each one
+// into lane `iv` of its vector. The five loop results are returned in the
+// order b0, b1, b2, a1, a2.
+// NOTE(review): c0..c5 are presumably the index constants 0, 1, 2, 4, 5 and
+// f0/f1 the float constants 0.0/1.0 - confirm against the caller.
+SmallVector<Value, 5> generateSOSParams(OpBuilder &rewriter, Location loc,
+                                        VectorType vectorTy, Value f0, Value f1,
+                                        Value c0, Value c1, Value c2, Value c4,
+                                        Value c5, Value filterSize,
+                                        Value kernel) {
+  // Scalars the five vectors are splatted from, in result order.
+  const Value seeds[] = {f1, f0, f0, f0, f0};
+  // Kernel column each vector is filled from, in result order.
+  const Value columns[] = {c0, c1, c2, c4, c5};
+
+  SmallVector<Value, 5> initVecs;
+  for (Value seed : seeds) {
+    Value splat = rewriter.create<vector::SplatOp>(loc, vectorTy, seed);
+    initVecs.push_back(splat);
+  }
+
+  // Distribute all params into the five param vectors.
+  auto vecDistribute = rewriter.create<scf::ForOp>(
+      loc, c0, filterSize, c1, initVecs,
+      [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+        // Load the whole row first, then insert, matching the op order of a
+        // hand-unrolled version.
+        SmallVector<Value, 5> coeffs;
+        for (Value column : columns) {
+          Value coeff = builder.create<memref::LoadOp>(loc, kernel,
+                                                       ValueRange{iv, column});
+          coeffs.push_back(coeff);
+        }
+
+        SmallVector<Value, 5> nextVecs;
+        for (unsigned i = 0; i < 5; ++i) {
+          Value updated = builder.create<vector::InsertElementOp>(
+              loc, coeffs[i], iargs[i], iv);
+          nextVecs.push_back(updated);
+        }
+
+        builder.create<scf::YieldOp>(loc, nextVecs);
+      });
+
+  return SmallVector<Value, 5>{vecDistribute.getResults()};
+}
+
+// Emits the three loops of the vectorized IIR computation; results are stored
+// into the `output` memref.
+//
+// All loops carry three vectors (out, s1, s2), initially splats of `f0`, and
+// run the same step: shuffle the carried `out` vector with mask `arrayRef`,
+// insert one scalar at lane `c0`, compute the new `out` with an FMA against
+// b0 and s1, then derive the next s1/s2 from b1/a1 and b2/a2.
+//   1. Injection: iv in [c0, cUpperBound), reads input[iv], stores nothing.
+//   2. Processing: iv in [c0, N - cUpperBound), reads input[iv + cUpperBound]
+//      and stores lane `iUpperBound` of `out` to output[iv].
+//   3. Tail: iv in [N - cUpperBound, N), feeds `f0` instead of an input
+//      sample and stores lane `iUpperBound` of `out` to output[iv].
+// NOTE(review): `arrayRef` is presumably a mask that moves every lane one
+// position up - confirm against the caller.
+void biquadProcess(OpBuilder &rewriter, Location loc, VectorType vectorTy,
+                   Value f0, Value c0, Value c1, Value cUpperBound,
+                   Value iUpperBound, SmallVector<Value, 5> SOSParams,
+                   ArrayRef<int64_t> arrayRef, Value N, Value input,
+                   Value output) {
+  Value coeffB0 = SOSParams[0];
+  Value coeffB1 = SOSParams[1];
+  Value coeffB2 = SOSParams[2];
+  Value coeffA1 = SOSParams[3];
+  Value coeffA2 = SOSParams[4];
+
+  Value initOut = rewriter.create<vector::SplatOp>(loc, vectorTy, f0);
+  Value initS1 = rewriter.create<vector::SplatOp>(loc, vectorTy, f0);
+  Value initS2 = rewriter.create<vector::SplatOp>(loc, vectorTy, f0);
+
+  // Shuffles the previous output vector and places `sample` at lane c0.
+  auto shiftInSample = [&](OpBuilder &b, Location l, Value sample,
+                           Value prevOut) -> Value {
+    Value shifted =
+        b.create<vector::ShuffleOp>(l, prevOut, prevOut, arrayRef);
+    Value stageIn =
+        b.create<vector::InsertElementOp>(l, sample, shifted, c0);
+    return stageIn;
+  };
+
+  // out = b0 * in + s1
+  auto computeStageOut = [&](OpBuilder &b, Location l, Value stageIn,
+                             Value s1) -> Value {
+    Value stageOut = b.create<vector::FMAOp>(l, coeffB0, stageIn, s1);
+    return stageOut;
+  };
+
+  // Writes lane `iUpperBound` of `stageOut` to output[position].
+  auto storeResult = [&](OpBuilder &b, Location l, Value stageOut,
+                         Value position) {
+    Value scalar =
+        b.create<vector::ExtractElementOp>(l, stageOut, iUpperBound);
+    b.create<memref::StoreOp>(l, scalar, output, position);
+  };
+
+  // s1' = (b1 * in + s2) - a1 * out ; s2' = b2 * in - a2 * out ; yield.
+  auto yieldNextState = [&](OpBuilder &b, Location l, Value stageIn,
+                            Value stageOut, Value s2) {
+    Value s1Lhs = b.create<vector::FMAOp>(l, coeffB1, stageIn, s2);
+    Value s1Rhs = b.create<arith::MulFOp>(l, coeffA1, stageOut);
+    Value s1Next = b.create<arith::SubFOp>(l, s1Lhs, s1Rhs);
+
+    Value s2Lhs = b.create<arith::MulFOp>(l, coeffB2, stageIn);
+    Value s2Rhs = b.create<arith::MulFOp>(l, coeffA2, stageOut);
+    Value s2Next = b.create<arith::SubFOp>(l, s2Lhs, s2Rhs);
+
+    b.create<scf::YieldOp>(l, ValueRange{stageOut, s1Next, s2Next});
+  };
+
+  // Injection stage: fills the pipeline, no output produced.
+  auto injectionResult = rewriter.create<scf::ForOp>(
+      loc, c0, cUpperBound, c1, ValueRange{initOut, initS1, initS2},
+      [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+        Value sample = builder.create<memref::LoadOp>(loc, input, iv);
+        Value stageIn = shiftInSample(builder, loc, sample, iargs[0]);
+        Value stageOut = computeStageOut(builder, loc, stageIn, iargs[1]);
+        yieldNextState(builder, loc, stageIn, stageOut, iargs[2]);
+      });
+
+  Value upperBound = rewriter.create<arith::SubIOp>(loc, N, cUpperBound);
+
+  // Processing stage: starts to produce output.
+  auto processResult = rewriter.create<scf::ForOp>(
+      loc, c0, upperBound, c1, ValueRange{injectionResult.getResults()},
+      [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+        Value index = builder.create<arith::AddIOp>(loc, iv, cUpperBound);
+        Value sample = builder.create<memref::LoadOp>(loc, input, index);
+        Value stageIn = shiftInSample(builder, loc, sample, iargs[0]);
+        Value stageOut = computeStageOut(builder, loc, stageIn, iargs[1]);
+        storeResult(builder, loc, stageOut, iv);
+        yieldNextState(builder, loc, stageIn, stageOut, iargs[2]);
+      });
+
+  // Tail stage: feeds f0 and generates the remaining output.
+  rewriter.create<scf::ForOp>(
+      loc, upperBound, N, c1, ValueRange{processResult.getResults()},
+      [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+        Value stageIn = shiftInSample(builder, loc, f0, iargs[0]);
+        Value stageOut = computeStageOut(builder, loc, stageIn, iargs[1]);
+        storeResult(builder, loc, stageOut, iv);
+        yieldNextState(builder, loc, stageIn, stageOut, iargs[2]);
+      });
+}
+
+// Emits the complete vectorized IIR computation for one vector length.
+//
+// Creates the vector type of `vecLen` x `floatType`, materializes
+// `vecLen - 1` both as an index constant and as a 64-bit integer constant,
+// builds the SOS coefficient vectors with generateSOSParams and hands
+// everything to biquadProcess.
+void iirVectorizationProcess(OpBuilder &rewriter, Location loc, uint64_t vecLen,
+                             FloatType floatType, Value f0, Value f1, Value c0,
+                             Value c1, Value c2, Value c4, Value c5,
+                             Value filterSize, Value kernel,
+                             ArrayRef<int64_t> arrayRef, Value N, Value input,
+                             Value output) {
+  VectorType vectorTy = VectorType::get(vecLen, floatType);
+
+  // Index of the last lane, needed once as `index` and once as i64.
+  uint64_t lastLane = vecLen - 1;
+  Value cUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, lastLane);
+  Value iUpperBound = rewriter.create<arith::ConstantIntOp>(
+      loc, /*value=*/lastLane, /*width=*/64);
+
+  auto coefficientVectors = dap::generateSOSParams(
+      rewriter, loc, vectorTy, f0, f1, c0, c1, c2, c4, c5, filterSize, kernel);
+
+  dap::biquadProcess(rewriter, loc, vectorTy, f0, c0, c1, cUpperBound,
+                     iUpperBound, coefficientVectors, arrayRef, N, input,
+                     output);
+}
+
+} // namespace dap
+} // namespace buddy
+#endif // UTILS_DAPUTILS_DEF
diff --git a/requirements.txt b/requirements.txt
index 782c70af9..606179eb7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,5 @@ tokenizers == 0.13.3
 sentencepiece == 0.1.99
 accelerate
 protobuf
+pybind11 == 2.11.1
+torchvision
diff --git a/tests/Interface/core/ContainerTest.cpp b/tests/Interface/core/ContainerTest.cpp
index 0f69d8938..3d80b3375 100644
--- a/tests/Interface/core/ContainerTest.cpp
+++ b/tests/Interface/core/ContainerTest.cpp
@@ -56,7 +56,7 @@ int main() {
   // Test custom shape no malloc constructor.
   //===--------------------------------------------------------------------===//
   MemRef<float, 2> testCustomShapeNoMallocConstructor(sizes, false, 0);
-  // CHECK: (nil)
+  // CHECK: {{(nil)|0x0}}
   fprintf(stderr, "%p\n", testCustomShapeNoMallocConstructor.getData());
 
   //===--------------------------------------------------------------------===//
@@ -64,7 +64,7 @@ int main() {
   //===--------------------------------------------------------------------===//
   std::vector<size_t> arrayShape = {1, 80, 32000};
   MemRef<float, 3> testArrayNoMallocConstructor(arrayShape, false, 0);
-  // CHECK: (nil)
+  // CHECK: {{(nil)|0x0}}
   fprintf(stderr, "%p\n", testArrayNoMallocConstructor.getData());
   
   //===--------------------------------------------------------------------===//
diff --git a/tests/Python/test_addmm.py b/tests/Python/test_addmm.py
index cb4459f45..563c87446 100644
--- a/tests/Python/test_addmm.py
+++ b/tests/Python/test_addmm.py
@@ -22,8 +22,11 @@ def foo(x, y, z):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y, z):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_amax.py b/tests/Python/test_amax.py
index 3759b352c..81944a2c2 100644
--- a/tests/Python/test_amax.py
+++ b/tests/Python/test_amax.py
@@ -22,8 +22,11 @@ def foo(x, dim):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, dim)
+graphs = dynamo_compiler.importer(foo, in1, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arange.py b/tests/Python/test_arange.py
index ac7fa3c45..f7e1cd1c4 100644
--- a/tests/Python/test_arange.py
+++ b/tests/Python/test_arange.py
@@ -2,10 +2,9 @@
 
 import torch
 import torch._dynamo as dynamo
-from torch._inductor.decomposition import decompositions as inductor_decomp
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,12 +15,15 @@ def foo(x):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +31,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_add.py b/tests/Python/test_arith_add.py
index 44db4609d..9c6e9d312 100644
--- a/tests/Python/test_arith_add.py
+++ b/tests/Python/test_arith_add.py
@@ -1,11 +1,10 @@
 # RUN: %PYTHON %s 2>&1 | FileCheck %s
 
 import torch
-import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +16,15 @@ def foo(x, y):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +32,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_div.py b/tests/Python/test_arith_div.py
index afc222a15..cf5b29023 100644
--- a/tests/Python/test_arith_div.py
+++ b/tests/Python/test_arith_div.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_mul.py b/tests/Python/test_arith_mul.py
index 9dc4dfbff..b22c6ebfd 100644
--- a/tests/Python/test_arith_mul.py
+++ b/tests/Python/test_arith_mul.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -13,21 +13,24 @@ def foo(x, y):
 
 
 in1 = torch.randn(10)
-in2 = torch.randn(10)
+in2 = 2
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = arith.constant
 # CHECK: %{{.*}} = tosa.mul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_sub.py b/tests/Python/test_arith_sub.py
index 95b5475fc..0f6238afa 100644
--- a/tests/Python/test_arith_sub.py
+++ b/tests/Python/test_arith_sub.py
@@ -21,8 +21,11 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_bmm.py b/tests/Python/test_bmm.py
index 403b0621b..ec7c8b160 100644
--- a/tests/Python/test_bmm.py
+++ b/tests/Python/test_bmm.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,17 +17,20 @@ def foo(x, y):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.matmul
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = linalg.batch_matmul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_cat.py b/tests/Python/test_cat.py
index db9dacf11..9c769ae65 100644
--- a/tests/Python/test_cat.py
+++ b/tests/Python/test_cat.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_clone.py b/tests/Python/test_clone.py
index 24fcd3225..3eabd7d64 100644
--- a/tests/Python/test_clone.py
+++ b/tests/Python/test_clone.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,17 +16,19 @@ def foo(x):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.identity
+# CHECK: %{{.*}} = tensor.extract_slice
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_convert_element_type.py b/tests/Python/test_convert_element_type.py
index 63cd1ddae..ca8838463 100644
--- a/tests/Python/test_convert_element_type.py
+++ b/tests/Python/test_convert_element_type.py
@@ -13,7 +13,7 @@ def foo(x, to_cast_type):
 
 
 in1 = torch.randn(10).to(torch.float32)
-to_cast_type = torch.float16
+to_cast_type = torch.int32
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
@@ -21,8 +21,11 @@ def foo(x, to_cast_type):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, to_cast_type)
+graphs = dynamo_compiler.importer(foo, in1, to_cast_type)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, to_cast_type):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_convolution_default.py b/tests/Python/test_convolution_default.py
new file mode 100644
index 000000000..fed1607c7
--- /dev/null
+++ b/tests/Python/test_convolution_default.py
@@ -0,0 +1,42 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+# Minimal module wrapping a single bias-free 2-D convolution; it is the model
+# this test imports.
+class Convolution(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        # Conv2d(in_channels=3, out_channels=255, kernel_size=(5, 5),
+        #        stride=3, padding=3), without a bias term.
+        self.conv = torch.nn.Conv2d(3, 255, (5, 5), 3, 3, bias=False)
+
+    # Applies the convolution to `a`.
+    def forward(self, a):
+        return self.conv(a)
+
+
+# Instantiate the model and a compiler that uses the TOSA operator registry.
+model = Convolution()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+# Import the model for a (1, 3, 640, 480) random input; exactly one graph is
+# expected.
+in1 = torch.randn((1, 3, 640, 480))
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+# Lower the graph and print the resulting module so FileCheck can match it.
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = "tosa.const"()
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tosa.conv2d
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_embedding.py b/tests/Python/test_embedding.py
index ee76d2068..484bb617b 100644
--- a/tests/Python/test_embedding.py
+++ b/tests/Python/test_embedding.py
@@ -22,8 +22,11 @@ def foo(weight, indices):
 weight = torch.randn(10, 5)
 indices = torch.randint(10, (3, 3)).to(torch.int32)
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(weight, indices)
+graphs = dynamo_compiler.importer(foo, weight, indices)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -34,16 +37,29 @@ def foo(weight, indices):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
-
 
 # test cast case
 weight = torch.randn(10, 5)
 indices = torch.randint(10, (3, 3)).to(torch.int64)
 
+graphs = dynamo_compiler.importer(foo, weight, indices)
+print(graphs)
+assert len(graphs) == 2
+graphs[0].lower_to_top_level_ir()
+print(graphs[0]._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.gather
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(weight, indices)
+graphs[1].lower_to_top_level_ir()
+print(graphs[1]._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -54,5 +70,4 @@ def foo(weight, indices):
 # CHECK: %{{.*}} = tosa.reshape
 # CHECK: return %{{.*}}
 # CHECK: }
-# CHECK: }
-print(dynamo_compiler.imported_module)
+# CHECK: }
\ No newline at end of file
diff --git a/tests/Python/test_exp.py b/tests/Python/test_exp.py
index 3fcff4361..7519a999b 100644
--- a/tests/Python/test_exp.py
+++ b/tests/Python/test_exp.py
@@ -20,8 +20,11 @@ def foo(x):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_full.py b/tests/Python/test_full.py
index 33cdc2c1d..0a5f5888b 100644
--- a/tests/Python/test_full.py
+++ b/tests/Python/test_full.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_index.py b/tests/Python/test_index.py
index da31095c1..c21ce1a5f 100644
--- a/tests/Python/test_index.py
+++ b/tests/Python/test_index.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.tensor([1])
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_expand.py b/tests/Python/test_iota.py
similarity index 52%
rename from tests/Python/test_expand.py
rename to tests/Python/test_iota.py
index 37e9aca38..d4e9d3e56 100644
--- a/tests/Python/test_expand.py
+++ b/tests/Python/test_iota.py
@@ -8,25 +8,28 @@
 from buddy.compiler.ops import tosa
 
 
-def foo(x, new_size):
-    return torch.ops.aten.expand(x, new_size)
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
 
-x = torch.randn(1, 3)
-new_size = (6, 3)
+    def forward(self, a):
+        return torch.arange(a)
 
-# Initialize the dynamo compiler.
+
+model = foo()
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
-
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, new_size)
-
+in1 = 40
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.add
-# CHECK: return %{{.*}} : tensor<6x3xf32>
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_lt.py b/tests/Python/test_lt.py
index a6f30b61c..5cea5ce5f 100644
--- a/tests/Python/test_lt.py
+++ b/tests/Python/test_lt.py
@@ -5,23 +5,26 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
-    return torch.ops.aten.lt(x,y)
+    return torch.ops.aten.lt(x, y)
 
 
 in1 = torch.ones([13], dtype=torch.int64)
 in2 = torch.ones([13, 1], dtype=torch.int64)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_masked_fill.py b/tests/Python/test_masked_fill.py
index 3802b3de7..3abbe88cd 100644
--- a/tests/Python/test_masked_fill.py
+++ b/tests/Python/test_masked_fill.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y, z):
@@ -18,12 +18,15 @@ def foo(x, y, z):
 in3 = 0
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -33,4 +36,3 @@ def foo(x, y, z):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_max_pool2d.py b/tests/Python/test_max_pool2d.py
new file mode 100644
index 000000000..eecfc73d9
--- /dev/null
+++ b/tests/Python/test_max_pool2d.py
@@ -0,0 +1,44 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class TestModule(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.pool = torch.nn.MaxPool2d((5, 5), 3, (2, 2))
+
+    def forward(self, a):
+        return self.pool(a)
+
+
+model = TestModule()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+in1 = torch.randn((1, 3, 640, 480))
+
+model_opt = torch.compile(model, backend=dynamo_compiler)
+assert torch.allclose(model_opt(in1), model(in1), equal_nan=True)
+
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tosa.max_pool2d
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_mean.py b/tests/Python/test_mean.py
index 781e49416..0595619d1 100644
--- a/tests/Python/test_mean.py
+++ b/tests/Python/test_mean.py
@@ -1,16 +1,14 @@
 # RUN: %PYTHON %s 2>&1 | FileCheck %s
 
 import torch
-import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp
-from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa
 
 
-def foo(x, y, z):
-    return torch.mean(x, y, z)
+def foo(x, y, keepdim):
+    return torch.mean(x, y, keepdim=keepdim)
 
 
 in1 = torch.ones([13, 13], dtype=torch.float32)
@@ -19,17 +17,25 @@ def foo(x, y, z):
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=aot_autograd_decompositions,
+    aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(
+    foo_mlir(in1, in2, keepdim=in3), foo(in1, in2, keepdim=in3), equal_nan=True
+)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = arith.constant
-# CHECK: %{{.*}} = linalg.generic
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_mm.py b/tests/Python/test_mm.py
index 4440b4ad8..4f7c41df3 100644
--- a/tests/Python/test_mm.py
+++ b/tests/Python/test_mm.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_neg.py b/tests/Python/test_neg.py
index e2f9e6f3d..78261085a 100644
--- a/tests/Python/test_neg.py
+++ b/tests/Python/test_neg.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,18 +16,20 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
 # CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
+# CHECK: %{{.*}} = linalg.negf
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_ones.py b/tests/Python/test_ones.py
index 7343fd102..4af4ead36 100644
--- a/tests/Python/test_ones.py
+++ b/tests/Python/test_ones.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_permute.py b/tests/Python/test_permute.py
index d260df3c2..7f1aad3e1 100644
--- a/tests/Python/test_permute.py
+++ b/tests/Python/test_permute.py
@@ -21,8 +21,11 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, perm)
+graphs = dynamo_compiler.importer(foo, x, perm)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}} : tensor<4x3x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_pow.py b/tests/Python/test_pow.py
index cfc47feb1..d67156383 100644
--- a/tests/Python/test_pow.py
+++ b/tests/Python/test_pow.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = 2
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_reciprocal.py b/tests/Python/test_reciprocal.py
new file mode 100644
index 000000000..9c31fb8b5
--- /dev/null
+++ b/tests/Python/test_reciprocal.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import math
+
+
+def foo(x):
+    return torch.ops.aten.reciprocal(x)
+
+
+x = torch.randn(10, 3, 6)
+
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=math.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True)
+
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_relu.py b/tests/Python/test_relu.py
new file mode 100644
index 000000000..c6d6bc6ae
--- /dev/null
+++ b/tests/Python/test_relu.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def forward(self, a):
+        return torch.relu(a)
+
+
+model = foo()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+in1 = torch.randn((1, 3, 640, 480), device="cpu")
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.maximum
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_reshape.py b/tests/Python/test_reshape.py
index 56a194697..989e0e4da 100644
--- a/tests/Python/test_reshape.py
+++ b/tests/Python/test_reshape.py
@@ -21,8 +21,11 @@ def foo(x, new_shape):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, new_shape)
+graphs = dynamo_compiler.importer(foo, x, new_shape)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, new_shape):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_rsqrt.py b/tests/Python/test_rsqrt.py
index 8ca0cf929..370334d66 100644
--- a/tests/Python/test_rsqrt.py
+++ b/tests/Python/test_rsqrt.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,17 +16,20 @@ def foo(x):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.rsqrt
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_rsub.py b/tests/Python/test_rsub.py
index fc945970c..99843af0e 100644
--- a/tests/Python/test_rsub.py
+++ b/tests/Python/test_rsub.py
@@ -6,28 +6,32 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
-    return y-x
+    return torch.ops.aten.rsub(x, y)
 
 
 in1 = torch.ones([13, 13], dtype=torch.float32)
-in2 = torch.ones([13, 13], dtype=torch.float32)
+in2 = 2
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.sub
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_select.py b/tests/Python/test_select.py
index d94bd296a..c54420a11 100644
--- a/tests/Python/test_select.py
+++ b/tests/Python/test_select.py
@@ -22,8 +22,11 @@ def foo(x, dim, index):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim, index)
+graphs = dynamo_compiler.importer(foo, x, dim, index)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, dim, index):
 # CHECK: return %{{.*}} : tensor<3x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sigmoid.py b/tests/Python/test_sigmoid.py
new file mode 100644
index 000000000..43f03cc11
--- /dev/null
+++ b/tests/Python/test_sigmoid.py
@@ -0,0 +1,35 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def forward(self, a):
+        return torch.sigmoid(a)
+
+
+model = foo()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+in1 = torch.randn((1, 3, 640, 480), device="cpu")
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.sigmoid
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_silu.py b/tests/Python/test_silu.py
index dcd919ca5..2aa504776 100644
--- a/tests/Python/test_silu.py
+++ b/tests/Python/test_silu.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_slice.py b/tests/Python/test_slice.py
index 61a8658e1..acc0acaa2 100644
--- a/tests/Python/test_slice.py
+++ b/tests/Python/test_slice.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, dim, start_idx, end_idx):
@@ -19,12 +19,15 @@ def foo(x, dim, start_idx, end_idx):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim, start_idx, end_idx)
+graphs = dynamo_compiler.importer(foo, x, dim, start_idx, end_idx)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, dim, start_idx, end_idx):
 # CHECK: return %{{.*}} : tensor<3x2x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_softmax.py b/tests/Python/test_softmax.py
index d5e656de7..eca5b2c60 100644
--- a/tests/Python/test_softmax.py
+++ b/tests/Python/test_softmax.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,26 +16,22 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = arith.constant
 # CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = tensor.empty()
 # CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sqrt.py b/tests/Python/test_sqrt.py
new file mode 100644
index 000000000..b929d1107
--- /dev/null
+++ b/tests/Python/test_sqrt.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import math
+
+
+def foo(x):
+    return torch.ops.aten.sqrt(x)
+
+
+x = torch.randn(10, 3, 6)
+
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=math.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True)
+
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = math.sqrt
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_squeeze.py b/tests/Python/test_squeeze.py
index f394ca8d7..e6b1b5c00 100644
--- a/tests/Python/test_squeeze.py
+++ b/tests/Python/test_squeeze.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([1, 13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sum.py b/tests/Python/test_sum.py
index 713910f15..e97f94209 100644
--- a/tests/Python/test_sum.py
+++ b/tests/Python/test_sum.py
@@ -22,8 +22,11 @@ def foo(x, dim):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim)
+graphs = dynamo_compiler.importer(foo, x, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_t.py b/tests/Python/test_t.py
index 835bb4c2f..09d44facc 100644
--- a/tests/Python/test_t.py
+++ b/tests/Python/test_t.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,18 +16,20 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = "tosa.const"
-# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.transpose
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_tanh.py b/tests/Python/test_tanh.py
index b1875dfd5..b9ca6082c 100644
--- a/tests/Python/test_tanh.py
+++ b/tests/Python/test_tanh.py
@@ -20,8 +20,11 @@ def foo(x):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_to_copy.py b/tests/Python/test_to_copy.py
index 9632d9f5c..0b6c2ad22 100644
--- a/tests/Python/test_to_copy.py
+++ b/tests/Python/test_to_copy.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.bool)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_transpose.py b/tests/Python/test_transpose.py
index d7e71be8e..9769604f3 100644
--- a/tests/Python/test_transpose.py
+++ b/tests/Python/test_transpose.py
@@ -3,7 +3,6 @@
 import torch
 import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp
-from torch._functorch.aot_autograd import aot_autograd_decompositions
 
 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa
@@ -19,17 +18,19 @@ def foo(x, y, z):
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=aot_autograd_decompositions,
+    aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = "tosa.const"()
 # CHECK: %{{.*}} = tosa.transpose
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_unsqueeze.py b/tests/Python/test_unsqueeze.py
index 577354b9f..5cb4ee552 100644
--- a/tests/Python/test_unsqueeze.py
+++ b/tests/Python/test_unsqueeze.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, dim):
@@ -17,12 +17,15 @@ def foo(x, dim):
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim)
+graphs = dynamo_compiler.importer(foo, x, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}} : tensor<1x10xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_var_mean.py b/tests/Python/test_var_mean.py
index eb7f254e4..eae1c9983 100644
--- a/tests/Python/test_var_mean.py
+++ b/tests/Python/test_var_mean.py
@@ -24,8 +24,11 @@ def foo_keepdim(x):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -44,10 +47,33 @@ def foo_keepdim(x):
 # CHECK: return %{{.*}} : tensor<f32>, tensor<f32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
 
-foo_keepdim_mlir = dynamo.optimize(dynamo_compiler)(foo_keepdim)
-foo_keepdim_mlir(x)
+graphs = dynamo_compiler.importer(foo_keepdim, x)
+assert len(graphs) == 2
+graphs[0].lower_to_top_level_ir()
+print(graphs[0]._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.sub
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: return %{{.*}} : tensor<f32>, tensor<f32>
+# CHECK: }
+# CHECK: }
+
+graphs[1].lower_to_top_level_ir()
+print(graphs[1]._imported_module)
+
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
 # CHECK: %{{.*}} = tosa.reduce_sum
@@ -63,4 +89,3 @@ def foo_keepdim(x):
 # CHECK: return %{{.*}} : tensor<1x1x1xf32>, tensor<1x1x1xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_view.py b/tests/Python/test_view.py
index 44db4609d..31eacddc7 100644
--- a/tests/Python/test_view.py
+++ b/tests/Python/test_view.py
@@ -5,29 +5,31 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp
 
 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg
 
 
 def foo(x, y):
-    return x + y
+    return torch.ops.aten.view(x, y)
 
 
 in1 = torch.randn(10)
-in2 = torch.randn(10)
+in2 = (2, 5)
 
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
 
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.add
+# CHECK: %{{.*}} = tosa.reshape
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_where.py b/tests/Python/test_where.py
new file mode 100644
index 000000000..5266f00b7
--- /dev/null
+++ b/tests/Python/test_where.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._functorch.aot_autograd import aot_autograd_decompositions
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import linalg
+
+
+def foo(x, y, z):
+    return torch.where(x, y, z)
+
+
+in1 = torch.ones([13, 13], dtype=torch.bool)
+in2 = 0
+in3 = torch.ones([13, 13], dtype=torch.float32)
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=linalg.ops_registry,
+    aot_autograd_decomposition=aot_autograd_decompositions,
+)
+
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = linalg.generic
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt
index 575fe7eea..afa07bff8 100644
--- a/tools/buddy-opt/CMakeLists.txt
+++ b/tools/buddy-opt/CMakeLists.txt
@@ -20,6 +20,7 @@ target_link_libraries(buddy-opt
   LowerDIPPass
   BuddyDAP
   LowerDAPPass
+  DAPVectorization
   BuddyRVV
   LowerRVVPass
   MatMulOptimization
@@ -34,4 +35,5 @@ target_link_libraries(buddy-opt
   LowerLinalgToGemminiPass
   SchedulingOnDevices
   LowerSche
+  MLIRGPUPasses
   )
diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp
index 4f359cb18..bf9652d71 100644
--- a/tools/buddy-opt/buddy-opt.cpp
+++ b/tools/buddy-opt/buddy-opt.cpp
@@ -56,6 +56,7 @@ void registerPoolingVectorizationPass();
 void registerLowerBudPass();
 void registerLowerDIPPass();
 void registerLowerDAPPass();
+void registerDAPVectorizePass();
 void registerLowerRVVPass();
 void registerBatchMatMulOptimizePass();
 void registerMatMulOptimizePass();
@@ -68,6 +69,8 @@ void registerLowerGemminiPass();
 void registerLowerLinalgToGemminiPass();
 void registerDeviceSchedulePass();
 void registerLowerSchePass();
+void registerGPUHostRegisterPass();
+void registerBuddyGPUBufferizePass();
 } // namespace buddy
 } // namespace mlir
 
@@ -82,6 +85,8 @@ int main(int argc, char **argv) {
   mlir::buddy::registerLowerBudPass();
   mlir::buddy::registerLowerDIPPass();
   mlir::buddy::registerLowerDAPPass();
+  // Register Vectorization of DAP Dialect.
+  mlir::buddy::registerDAPVectorizePass();
   mlir::buddy::registerLowerRVVPass();
   mlir::buddy::registerLowerVectorExpPass();
   mlir::buddy::registerLowerGemminiPass();
@@ -95,9 +100,13 @@ int main(int argc, char **argv) {
   mlir::buddy::registerTransposeOptimizationPass();
   mlir::buddy::registerConvOptimizePass();
   mlir::buddy::registerDeviceSchedulePass();
-  mlir::buddy::registerLowerSchePass();;
+  mlir::buddy::registerLowerSchePass();
 
-  mlir::DialectRegistry registry;
+  // Register GPU passes.
+  mlir::buddy::registerGPUHostRegisterPass();
+  mlir::buddy::registerBuddyGPUBufferizePass();
+
+  mlir::DialectRegistry registry;
   // Register all MLIR core dialects.
   registerAllDialects(registry);
   mlir::registerAllExtensions(registry);