diff --git a/.gitignore b/.gitignore index 485cccfcf..f818d1151 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # Clangd cache .cache + +# Python venv +venv* diff --git a/.gitmodules b/.gitmodules index 5ed61a524..00d892bd3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,6 +2,8 @@ path = llvm url = https://github.com/llvm/llvm-project.git branch = main + shallow = true [submodule "thirdparty/mimalloc"] path = thirdparty/mimalloc url = https://github.com/microsoft/mimalloc.git + shallow = true diff --git a/benchmark/makefile b/benchmark/makefile index 3dc5543b7..112d554c1 100644 --- a/benchmark/makefile +++ b/benchmark/makefile @@ -33,7 +33,7 @@ all:$(OUT) $(shell rm -rf tempFile) BUDDY_OPT_OPTIONS := -conv-vectorization="strip-mining=${STRIP}" -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts -MLIR_OPT_OPTIONS := -convert-linalg-to-loops -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf -convert-scf-to-cf -convert-vector-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts +MLIR_OPT_OPTIONS := -convert-linalg-to-loops -lower-affine -convert-scf-to-cf -convert-scf-to-cf -convert-vector-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts $(OUT):$(SOURCE) @echo $* diff --git a/docs/IIRVectorizationAlgorithm.md b/docs/IIRVectorizationAlgorithm.md new file mode 100644 index 000000000..420450581 --- /dev/null +++ b/docs/IIRVectorizationAlgorithm.md @@ -0,0 +1,53 @@ +# Algorithm Explanation + +This document shows the details of the algorithms used in DAPVectorization pass. + +## IIR Vectorization Implementation + +IIR filter can represent in different forms, typically ZPK(Zero-Pole-Gain) form or SOS(Second-Order Sections) form. Filter can be defined in ZPK form and then transformed to SOS form. 
+ +### Scalar Computation for IIR Operation + +Currently, our IIR operation supports filters in SOS form. When the filter has only one set of parameters, denoted as {$𝑏_0, 𝑏_1, b_2, a_1, a_2$} (parameters are distinguished by subscripts), the equation takes the following form: + +**IIR with one set of params:** +$$ y_n = 𝑏_0 𝑥_𝑛 + 𝑏_1 𝑥_{𝑛−1} − 𝑎_1 𝑦_{𝑛−1} + 𝑏_2 𝑥_{𝑛−2} − 𝑎_2 𝑦_{𝑛−2} $$ + +When the filter has multiple sets of parameters, the operation uses a cascade method for calculation. Take two sets of params as an example, with filter parameters denoted as {$𝑏_0^0, 𝑏_1^0, b_2^0, a_1^0, a_2^0$} and {$𝑏_0^1, 𝑏_1^1, b_2^1, a_1^1, a_2^1$}, where the superscript indicates parameters from different sets. The process is listed below: + +**IIR with two sets of params:** +$$y_n^0 = 𝑏_0^0 𝑥_𝑛^0 + 𝑏_1^0 𝑥_{𝑛−1}^0 − 𝑎_1^0 𝑦_{𝑛−1}^0 + 𝑏_2^0 𝑥_{𝑛−2}^0 − 𝑎_2^0 𝑦_{𝑛−2}^0 $$ +$$x_n^1 = y_n^0$$ +$$y_n^1 = 𝑏_0^1 𝑥_𝑛^1 + 𝑏_1^1 𝑥_{𝑛−1}^1 − 𝑎_1^1 𝑦_{𝑛−1}^1 + 𝑏_2^1 𝑥_{𝑛−2}^1 − 𝑎_2^1 𝑦_{𝑛−2}^1$$ + +### Vectorization for IIR Operation + +This section shows the implementation of IIR Vectorization algorithm. The example shown below contains 4 sets of parameters, with superscript {$0, 1, 2, 3$} representing each set of parameters. + +1. **Segment IIR Equation & Generate Vector Params** + ![Segment IIR Equation to three parts due to different time moment](./Images/IIRSegmentation.png) + The IIR equation is segmented into 3 parts, and each part is calculated at a different time moment. When $S2$ is calculated at time $t_i$, it is used to calculate $S1$ at time $t_{i+1}$, which then produces the final result at time $t_{i+2}$. + + ![Generate SOS params in vector form](./Images/IIRVectorParams.png) + In the above image, vector $B0$ is the collection of all $b_0$ params, and the other vectors $B1, B2, A1, A2$ each collect their corresponding params. + +2. 
**Computing One Set of Params** + ![Computing step 1](./Images/IIRComputing1.png) + The first step in computation, calculate $y_0^0$ with the following equation: + $$𝑦_0^0=𝑏_0^0𝑥_0+s_1^0$$ + At time moment $0$, the initial values of $S1, S2$ were set to $0$. + ![Computing step 2](./Images/IIRComputing2.png) + The second step in computation, calculate $s_1^0$ with the following equation: + $$𝑠_1^0=𝑏_1^0𝑥_0−𝑎_1^0𝑦_0^0+s_2^0 $$ + ![Computing step 3](./Images/IIRComputing3.png) + The third step in computation, calculate $s_2^0$ with the following equation: + $$𝑠_2^0=𝑏_2^0𝑥_0−𝑎_2^0𝑦_0^0$$ + + The above three steps happen in the same time moment $t$, which is the same loop iteration in program. The order of these three steps cannot change, because the value from vector $S1, S2$ were actually produced before time moment $t$. +3. **Cascade Method** + ![Cascade step 1](./Images/IIRCascade1.png) + Now the values $y_0^0$, $s_1^0$ and $s_2^0$ were produced, here the whole system will get a new input $x1$ and move on the computation. + ![Cascade step 2](./Images/IIRCascade2.png) + The $y_0^0$ were moved right and the new input $x1$ were pushed in. The value in vector $S1$ and $S2$ are not changed and will jump back to the second step. The difference in the next iteration is that two sets of parameters are used and this is where the performance improves. + + When the example above came to the fourth iteration, the computation will be using all the parameters. This situation occurs for the vast majority of the time during the computation. Also, considering a longer vector length(currently support 4, 8, 16, 32, 64), it can achieve a 10x performance improvement. 
diff --git a/docs/Images/IIRCascade1.png b/docs/Images/IIRCascade1.png new file mode 100644 index 000000000..e766735b1 Binary files /dev/null and b/docs/Images/IIRCascade1.png differ diff --git a/docs/Images/IIRCascade2.png b/docs/Images/IIRCascade2.png new file mode 100644 index 000000000..5e97da94d Binary files /dev/null and b/docs/Images/IIRCascade2.png differ diff --git a/docs/Images/IIRComputing1.png b/docs/Images/IIRComputing1.png new file mode 100644 index 000000000..69b5f33de Binary files /dev/null and b/docs/Images/IIRComputing1.png differ diff --git a/docs/Images/IIRComputing2.png b/docs/Images/IIRComputing2.png new file mode 100644 index 000000000..1e33f8532 Binary files /dev/null and b/docs/Images/IIRComputing2.png differ diff --git a/docs/Images/IIRComputing3.png b/docs/Images/IIRComputing3.png new file mode 100644 index 000000000..4a8d3d2b4 Binary files /dev/null and b/docs/Images/IIRComputing3.png differ diff --git a/docs/Images/IIRSegmentation.png b/docs/Images/IIRSegmentation.png new file mode 100644 index 000000000..e498e8652 Binary files /dev/null and b/docs/Images/IIRSegmentation.png differ diff --git a/docs/Images/IIRVectorParams.png b/docs/Images/IIRVectorParams.png new file mode 100644 index 000000000..87e0e81bf Binary files /dev/null and b/docs/Images/IIRVectorParams.png differ diff --git a/examples/BuddyBert/import-bert.py b/examples/BuddyBert/import-bert.py index c2044cb03..92e8e055e 100644 --- a/examples/BuddyBert/import-bert.py +++ b/examples/BuddyBert/import-bert.py @@ -46,12 +46,16 @@ "attention_mask": torch.tensor([[1 for _ in range(5)]], dtype=torch.int64), } with torch.no_grad(): - module, params = dynamo_compiler.importer(model, **inputs) + graphs = dynamo_compiler.importer(model, **inputs) +assert len(graphs) == 1 +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +graph.lower_to_top_level_ir(do_params_pack=True) current_path = os.path.dirname(os.path.abspath(__file__)) with open(Path(current_path) / "bert.mlir", 
"w") as module_file: - module_file.write(str(module)) + module_file.write(str(graph._imported_module)) float32_param = np.concatenate( [param.detach().numpy().reshape([-1]) for param in params[:-1]] diff --git a/examples/BuddyGPU/matmul.mlir b/examples/BuddyGPU/matmul.mlir new file mode 100644 index 000000000..642fc2d0a --- /dev/null +++ b/examples/BuddyGPU/matmul.mlir @@ -0,0 +1,8 @@ +module { + func.func @forward(%arg0: tensor<5376x2048xf32>, %arg1: tensor<2048x5376xf32>) -> tensor<5376x5376xf32> { + %cst = arith.constant dense<0.000000e+00> : tensor<5376x5376xf32> + %0 = linalg.matmul {cast = #linalg.type_fn} ins(%arg0, %arg1 : tensor<5376x2048xf32>, tensor<2048x5376xf32>) outs(%cst : tensor<5376x5376xf32>) -> tensor<5376x5376xf32> + return %0 : tensor<5376x5376xf32> + } +} + diff --git a/examples/BuddyGPU/matmul.py b/examples/BuddyGPU/matmul.py new file mode 100644 index 000000000..af6dfe341 --- /dev/null +++ b/examples/BuddyGPU/matmul.py @@ -0,0 +1,54 @@ +# ===- matmul.py -------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===-------------------------------------------------------------------------- +# +# This file demonstrates the usage of Buddy's frontend for PyTorch module. 
+# +# ===-------------------------------------------------------------------------- + +import os +import time + +import numpy +import torch +from transformers import LlamaForCausalLM, LlamaTokenizer +from torch._functorch.aot_autograd import aot_autograd_decompositions +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + +dtype = torch.float32 + +def foo(x, y): + return torch.matmul(x, y) + +in1 = torch.ones([5376, 2048], dtype=torch.float32) +in2 = torch.ones([2048, 5376], dtype=torch.float32) +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() + +path_prefix = os.path.dirname(os.path.abspath(__file__)) +# Write the MLIR module to the file. 
+with open(os.path.join(path_prefix, "matmul.mlir"), "w") as module_file: + print(graph._imported_module, file=module_file) diff --git a/examples/BuddyGPU/run-module.py b/examples/BuddyGPU/run-module.py new file mode 100644 index 000000000..ef29995f4 --- /dev/null +++ b/examples/BuddyGPU/run-module.py @@ -0,0 +1,195 @@ +import mlir.ir as ir +import mlir.dialects.func as func +import mlir.dialects.memref as memref +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir import runtime as rt +from mlir.ir import * +import numpy as np +import ctypes +import gc +import torch + + +def to_numpy(element_type: str) -> np.dtype: + match element_type: + case "f16": + return np.float16 + case "f32": + return np.float32 + case "f64": + return np.float64 + case "i8": + return np.int8 + case "i16": + return np.int16 + case "i32": + return np.int32 + case "i64": + return np.int64 + case "bf16": + raise ValueError("bf16 is not supported by numpy") + case _: + raise ValueError(f"Unsupported type: {element_type}") + + +def to_mlir(dtype: np.dtype) -> ir.Type: + match dtype: + case np.float16: + return ir.F16Type.get() + case np.float32: + return ir.F32Type.get() + case np.float64: + return ir.F64Type.get() + case np.int8: + return ir.IntegerType.get_signless(8) + case np.int16: + return ir.IntegerType.get_signless(16) + case np.int32: + return ir.IntegerType.get_signless(32) + case np.int64: + return ir.IntegerType.get_signless(64) + case _: + raise ValueError(f"Unsupported type: {dtype}") + + +def lower_to_llvm_cpu(module: Module) -> Module: + pm = PassManager("builtin.module") + pm.add("func.func(tosa-to-linalg-named)") + pm.add("func.func(tosa-to-linalg)") + pm.add("func.func(tosa-to-tensor)") + pm.add("func.func(tosa-to-arith)") + pm.add("arith-expand") + pm.add("eliminate-empty-tensors") + pm.add("empty-tensor-to-alloc-tensor") + pm.add("convert-elementwise-to-linalg") + pm.add("one-shot-bufferize") + pm.add("func.func(convert-linalg-to-affine-loops)") + 
pm.add("affine-loop-fusion") + pm.add("func.func(affine-parallelize)") + pm.add("lower-affine") + pm.add("convert-scf-to-openmp") + pm.add("func-bufferize") + pm.add("arith-bufferize") + pm.add("func.func(tensor-bufferize)") + pm.add("func.func(buffer-deallocation)") + pm.add("func.func(finalizing-bufferize)") + pm.add("expand-strided-metadata") + pm.add("convert-vector-to-llvm") + pm.add("memref-expand") + pm.add("arith-expand") + pm.add("convert-arith-to-llvm") + pm.add("finalize-memref-to-llvm") + pm.add("convert-scf-to-cf") + pm.add("func.func(llvm-request-c-wrappers)") + pm.add("convert-openmp-to-llvm") + pm.add("convert-math-to-llvm") + pm.add("convert-math-to-libm") + pm.add("convert-func-to-llvm") + pm.add("reconcile-unrealized-casts") + pm.run(module.operation) + return module + + +def new_ranked_memref_descriptor(nparray: np.ndarray): + ctp = rt.as_ctype(nparray.dtype) + if nparray.ndim == 0: + x = rt.make_zero_d_memref_descriptor(ctp)() + x.allocated = nparray.ctypes.data + x.aligned = nparray.ctypes.data_as(ctypes.POINTER(ctp)) + x.offset = ctypes.c_longlong(0) + return x + + x = rt.make_nd_memref_descriptor(nparray.ndim, ctp)() + nbytes = nparray.nbytes + buffer = ctypes.create_string_buffer(nbytes) + ctypes.memmove(buffer, nparray.ctypes.data, nbytes) + x.allocated = ctypes.cast(buffer, ctypes.c_void_p).value + x.aligned = ctypes.cast(buffer, ctypes.POINTER(ctp)) + x.offset = ctypes.c_longlong(0) + x.shape = nparray.ctypes.shape + + # Numpy uses byte quantities to express strides, MLIR OTOH uses the + # torch abstraction which specifies strides in terms of elements. 
+ strides_ctype_t = ctypes.c_longlong * nparray.ndim + x.strides = strides_ctype_t( + *[x // nparray.itemsize for x in nparray.strides] + ) + return x + + +def testMemrefAdd(): + with Context(): + module = Module.parse( + """ + module { + func.func @main(%arg0: memref<1xf32>, %arg1: memref, %arg2: memref<1xf32>) attributes { llvm.emit_c_interface } { + %0 = arith.constant 0 : index + %1 = memref.load %arg0[%0] : memref<1xf32> + %2 = memref.load %arg1[] : memref + %3 = arith.addf %1, %2 : f32 + memref.store %3, %arg2[%0] : memref<1xf32> + return + } + } """ + ) + arg1 = np.array([32.5]).astype(np.float32) + arg2 = np.array(6).astype(np.float32) + res = np.array([0]).astype(np.float32) + + arg1_memref_ptr = ctypes.pointer( + ctypes.pointer(rt.get_ranked_memref_descriptor(arg1)) + ) + arg2_memref_ptr = ctypes.pointer( + ctypes.pointer(rt.get_ranked_memref_descriptor(arg2)) + ) + res_memref_ptr = ctypes.pointer( + ctypes.pointer(rt.get_ranked_memref_descriptor(res)) + ) + + execution_engine = ExecutionEngine(lower_to_llvm_cpu(module)) + execution_engine.invoke( + "main", arg1_memref_ptr, arg2_memref_ptr, res_memref_ptr + ) + npout = rt.ranked_memref_to_numpy(res_memref_ptr[0]) + print(npout) + +def get_memref_descriptors(args: list[Type]): + memref_ptrs = [] + for arg in args: + elem_type = to_numpy(str(arg.element_type)) + np_arg = np.random.rand(*arg.shape).astype(elem_type) + memref_ptrs.append( + ctypes.pointer( + ctypes.pointer(new_ranked_memref_descriptor(np_arg)) + ) + ) + return memref_ptrs + +def test(): + with Context() as ctx: + file = open( + "/home/liam/PLCT/buddy-mlir/examples/BuddyGPU/matmul.mlir", "r" + ) + module: Module = Module.parse(file.read()) + funcOp: func.FuncOp = ( + module.operation.regions[0].blocks[0].operations[0] + ) + funcName = str(funcOp.name).replace('"', "") + assert isinstance(funcOp, func.FuncOp) + args_type: list[Type] = [arg.type for arg in funcOp.arguments] + res_type = funcOp.type.results + + newModule = 
lower_to_llvm_cpu(module) + memref_ptrs = get_memref_descriptors(res_type+args_type) + + engine = ExecutionEngine(newModule,shared_libs=['/usr/lib/libomp.so']) + engine.invoke(funcName, *memref_ptrs) + out = rt.ranked_memref_to_numpy(memref_ptrs[0][0]) + print(out) + input1 = rt.ranked_memref_to_numpy(memref_ptrs[1][0]) + input2 = rt.ranked_memref_to_numpy(memref_ptrs[2][0]) + numpy_out = np.matmul(input1, input2) + print(f"MLIR equal to PyTorch? {np.allclose(out, numpy_out)}") + +test() diff --git a/examples/BuddyGraph/README.md b/examples/BuddyGraph/README.md new file mode 100644 index 000000000..d7b977f57 --- /dev/null +++ b/examples/BuddyGraph/README.md @@ -0,0 +1,23 @@ +# Buddy Graph Representation Examples + +## Run the Examples + +0. Enter your Python Env +``` +(base)$ conda activate buddy +(buddy)$ ... +``` +1. Build Python Packages +2. Configure Python Path +``` +(buddy)$ cd buddy-mlir/build +(buddy)$ export BUDDY_MLIR_BUILD_DIR=$PWD +(buddy)$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build +(buddy)$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} + +``` +3. Run the Examples +``` +(buddy)$ cd examples/BuddyGraph +(buddy)$ python import-dynamo-break.py +``` \ No newline at end of file diff --git a/examples/BuddyGraph/import-dynamo-break.py b/examples/BuddyGraph/import-dynamo-break.py new file mode 100644 index 000000000..42bbed603 --- /dev/null +++ b/examples/BuddyGraph/import-dynamo-break.py @@ -0,0 +1,63 @@ +# ===- import-dynamo-break.py -------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# The example for dynamo graph break, import, and execute. +# +# ===--------------------------------------------------------------------------- + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp +from torch._functorch.aot_autograd import aot_autograd_decompositions + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +class TestModule(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, b, c): + if torch.nn.functional.silu(b)[0][0]: + return torch.add(b, c) + else: + return torch.matmul(b, c) + +# Define a PyTorch model and run it with PyTorch runtime. +model = TestModule() +a, b = torch.randn((1024, 1024)), torch.randn((1024, 1024)) +print(model(a, b)) + +# JIT Mode +# Initialize Buddy Dynamo Compiler to compile and execute the PyTorch model. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=aot_autograd_decompositions +) +model_opt = torch.compile(model, backend=dynamo_compiler) +print(model_opt(a, b)) + +torch._dynamo.reset() + +# AOT Mode +# Import PyTorch model to Buddy Graph and MLIR/LLVM IR. 
+graphs = dynamo_compiler.importer( + model, a, b +) +for g in graphs: + g.lower_to_top_level_ir() + print(g._imported_module) diff --git a/examples/BuddyLlama/.gitignore b/examples/BuddyLlama/.gitignore index 6fc96fbb7..ffee494f3 100644 --- a/examples/BuddyLlama/.gitignore +++ b/examples/BuddyLlama/.gitignore @@ -1,5 +1,5 @@ # model params file -arg0.data +*.data # model mlir file -llama.mlir +*.mlir diff --git a/examples/BuddyLlama/CMakeLists.txt b/examples/BuddyLlama/CMakeLists.txt index c344cfe44..6c70f11c7 100644 --- a/examples/BuddyLlama/CMakeLists.txt +++ b/examples/BuddyLlama/CMakeLists.txt @@ -50,6 +50,47 @@ add_custom_command( add_library(LLAMA STATIC llama.o) +add_custom_command( + OUTPUT llama-gpu.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | + ${BUDDY_BINARY_DIR}/buddy-opt + -arith-expand + -eliminate-empty-tensors + -empty-tensor-to-alloc-tensor + -linalg-bufferize + -matmul-paralell-vectorization-optimize + -batchmatmul-optimize + -convert-linalg-to-affine-loops + -affine-loop-fusion + -affine-parallelize + -lower-affine + -canonicalize + -func-bufferize + -arith-bufferize + -tensor-bufferize + -buffer-deallocation + -finalizing-bufferize + -gpu-map-parallel-loops + -convert-parallel-loops-to-gpu + -canonicalize + -gpu-kernel-outlining + -convert-scf-to-cf + -memref-expand + -finalize-memref-to-llvm + -convert-arith-to-llvm + -convert-gpu-to-nvvm='has-redux=1' + -llvm-request-c-wrappers + --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llvm-as | + ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 + -o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/llama-gpu.o + DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir + COMMENT 
"Building llama-gpu.o " + VERBATIM) +add_library(LLAMA_GPU STATIC llama-gpu.o) + SET_SOURCE_FILES_PROPERTIES( template.o PROPERTIES @@ -74,3 +115,21 @@ if(BUDDY_MLIR_USE_MIMALLOC) endif() target_link_libraries(buddy-llama-run ${BUDDY_LLAMA_LIBS}) + +SET_TARGET_PROPERTIES( + LLAMA_GPU + PROPERTIES + LINKER_LANGUAGE C) + +set(BUDDY_LLAMA_GPU_LIBS + LLAMA_GPU + mlir_c_runner_utils + omp +) +if(BUDDY_MLIR_USE_MIMALLOC) + list(APPEND BUDDY_LLAMA_GPU_LIBS mimalloc) +endif() + +add_executable(buddy-llama-gpu-run llama-gpu.cpp) +target_link_directories(buddy-llama-gpu-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) +target_link_libraries(buddy-llama-gpu-run ${BUDDY_LLAMA_GPU_LIBS}) diff --git a/examples/BuddyLlama/README-gpu.md b/examples/BuddyLlama/README-gpu.md new file mode 100644 index 000000000..a6cdffd79 --- /dev/null +++ b/examples/BuddyLlama/README-gpu.md @@ -0,0 +1,197 @@ +# Buddy Compiler LLaMA on GPU Example + +** This is a work in progress. Current version of Buddy-MLIR is using an older version of LLVM, which is not compatible with the latest version of CUDA. We are working on updating the LLVM version. ** + +## 1. Prerequisites +Please refer to [readme-cpu.md](readme-cpu.md) for most of the steps, except for the following steps. + +1. Install CUDA-toolkit +Please refer to [CUDA-toolkit](https://developer.nvidia.com/cuda-toolkit) for installation. +It is suggested that you install nsight system and nsight as well compute for profiling. Please refer to [nsight-system](https://developer.nvidia.com/nsight-systems) and [nsight-compute](https://developer.nvidia.com/nsight-compute) for installation. +Don't forget to add CUDA and other tools to your PATH. + +... + +For Step 4. Build and check LLVM/MLIR, please enable CUDA runner for MLIR. 
+ +``` +$ cd buddy-mlir +$ mkdir llvm/build +$ cd llvm/build +$ cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \ + -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DMLIR_ENABLE_BINDINGS_PYTHON=ON \ + -DPython3_EXECUTABLE=$(which python3) \ + -DMLIR_ENABLE_CUDA_RUNNER=ON \ + -DLLVM_CCACHE_BUILD=ON +$ ninja check-clang check-mlir omp +``` + +## 2. Lowering LLaMA MLIR to CUDA +We would use multiple steps to demonstrate the lowering process. Notice that the first step requires the `mlir-opt` built in previous steps, but the remaining ones need the latest version of `mlir-opt` and other llvm tools such as `llc`. + +### 2.1 Lowering TOSA to Linalg +Due to the availability of certain operations such as `transpose`, the current LLaMA lowering process requires the use of the TOSA dialect. We would first lower the LLaMA model to a mixture of TOSA and Linalg dialects. +``` +mlir-opt llama.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o llama-linalg-default.mlir +``` +**Use the old version of `mlir-opt` built in previous steps. Otherwise you might get the following error:** +``` +llama.mlir:747:11: error: 'tosa.mul' op attribute 'shift' failed to satisfy constraint: 8-bit signless integer attribute + %36 = "tosa.mul"(%5, %35) {shift = 0 : i32} : (tensor<1x80x4096xf32>, tensor<1x80x1xf32>) -> tensor<1x80x4096xf32> + ^ +``` +There should be no `tosa` operations in the output. Most of the operations should be `linalg` operations such as `matmul`, `batch_matmul` or `generic`. + +### 2.2 Bufferizing Linalg +This step bufferizes the Linalg operations. It would fully convert the linalg-on-tensor operations to scf-on-memref operations. 
+ +- Bufferize using the old bufferization pipeline: +``` +mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o llama-bufferized.mlir +``` + +- Bufferize everything using one-shot-bufferize: +``` +mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -expand-realloc -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline -o llama-bufferized.mlir +``` + +- Bufferize everything but function boundaries using one-shot-bufferize: +``` +mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize -func-bufferize -expand-realloc -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -finalizing-bufferize -cse -canonicalize -buffer-deallocation-pipeline -o llama-bufferized.mlir +``` + +- Bufferize GPU first +``` +buddy-opt -gpu-bufferize llama-linalg-default.mlir -o llama-gpu-bufferized.mlir +``` + +- Bufferize everything else using one-shot-bufferize: +``` +mlir-opt llama-gpu-bufferized.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -expand-realloc -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline -o llama-bufferized.mlir +``` + +You should not be seeing any tensor on linalg operations. 
All operations would look like this: + +``` +scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c80, %c80) step (%c1, %c1) { + %6 = memref.load %0[%arg3] : memref<80xi64> + %7 = memref.load %expand_shape_650[%arg2, %c0] : memref<80x1xi64> + %8 = arith.cmpi slt, %6, %7 : i64 + memref.store %8, %alloc_651[%arg2, %arg3] : memref<80x80xi1> + scf.yield + } +``` + +### 2.3 Converting to GPU +This step converts the scf-on-memref operations to gpu operations, with gpu kernels outlined. + +``` +mlir-opt llama-bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o llama-outlined.mlir +``` + +GPU kernels will be converted into separate modules and functions as such: +``` +gpu.module @forward_kernel_753 { + gpu.func @forward_kernel(%arg0: memref<80x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<80x4096xf32>, %arg3: index, %arg4: index, %arg5: index) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + scf.for %arg6 = %arg3 to %arg4 step %arg5 { + %12 = memref.load %arg0[%0, %arg6] : memref<80x4096xf32> + %13 = memref.load %arg1[%arg6, %1] : memref<4096x4096xf32> + %14 = memref.load %arg2[%0, %1] : memref<80x4096xf32> + %15 = arith.mulf %12, %13 : f32 + %16 = arith.addf %14, %15 : f32 + memref.store %16, %arg2[%0, %1] : memref<80x4096xf32> + } + gpu.return + } + } +``` + +### 2.4 Converting to LLVM and NVVM operations +This step converts the operations to LLVM dialect operations, and then convert some math functions to NVVM intrinsics. 
+ +``` +buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir +mlir-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir +``` + +Why do we need the `convert-gpu-to-nvvm` step? If it is not applied, and we are using the unmodified lowering pipeline from torch to linalg, the generated LLVM IR would look like this: +``` +%24 = llvm.getelementptr %17[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +%25 = llvm.load %24 : !llvm.ptr -> f32 +%26 = math.fpowi %25, %arg2 : f32, i32 +%27 = llvm.extractvalue %2[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> +%28 = llvm.mlir.constant(327680 : index) : i64 +``` +For CPU, math operations such as `math.fpowi` would be lowered to LLVM intrinsics such as `llvm.powi.f32`. However, for GPU, we need to use NVVM intrinsics. And sadly there is no NVVM intrinsics for `math.fpowi`. So we would need to change the lowering pipeline to use `mlir.powf` instead. Before lowering to nvvm, it would look like this: +``` +%24 = llvm.getelementptr %17[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +%25 = llvm.load %24 : !llvm.ptr -> f32 +%26 = math.powf %25, %arg2 : f32 +%27 = llvm.extractvalue %2[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> +%28 = llvm.mlir.constant(327680 : index) : i64 +``` + +And after the lowering: +``` +llvm.func @__nv_powf(f32, f32) -> f32 +... +%41 = llvm.getelementptr %36[%40] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +%42 = llvm.load %41 : !llvm.ptr -> f32 +%43 = llvm.call @__nv_powf(%42, %arg10) : (f32, f32) -> f32 +%44 = llvm.extractvalue %27[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> +``` + +It now uses `llvm.call` to call the NVVM intrinsics. + +### 2.5 Request C wrappers +Notice that you must request wrappers before compiling GPU codes. 
+``` +mlir-opt llama-nvvm.mlir -llvm-request-c-wrappers -o llama-wrapper.mlir +``` + +### 2.6 Lowering to LLVM Dialect + GPU Binary +``` +mlir-opt llama-wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o llama-cubin.mlir +``` +Now you could use a builtin pipeline to lower code to nvvm. Notice that you must specify the chip and features. You could find the chip and features from Nvidia. +After the process all gpu code would be compiled. +``` + gpu.binary @forward_kernel_1078 [#gpu.object<#nvvm.target, "...">] +``` + +### 2.7 Translate to LLVM IR +``` +mlir-translate llama-cubin.mlir --mlir-to-llvmir -o llama.ll +``` + +### 2.8 Compile the LLVM IR +** Remember to use the latest version of LLC, as the latest version of MLIR generates some new intrinsics that are not supported by the old version of LLC. ** +``` +llc llama.ll -filetype=obj -relocation-model=pic -O3 -o llama.o +``` + +### 2.9 Link the object file and run +Following is an example of linking the object file with the runtime library and run the program. You could find the runtime library in the build directory of llvm-project. +``` +clang llama.o llama-main.cpp.o /path-to/llvm-project/build/lib/libmlir_cuda_runtime.so /path-to/llvm-project/build/lib/libmlir_c_runner_utils.so +``` +** Notice that current version of the llvm-project used by Buddy-MLIR would encounter problems with CUDA_RUNNERS enabled. Please use the latest version of MLIR for this step. 
** \ No newline at end of file diff --git a/examples/BuddyLlama/import-llama2.py b/examples/BuddyLlama/import-llama2.py index d5a3a29e1..d63eebe37 100644 --- a/examples/BuddyLlama/import-llama2.py +++ b/examples/BuddyLlama/import-llama2.py @@ -19,11 +19,13 @@ # ===--------------------------------------------------------------------------- import os +import time import numpy import torch from transformers import LlamaForCausalLM, LlamaTokenizer from torch._functorch.aot_autograd import aot_autograd_decompositions +from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.ops import tosa @@ -31,6 +33,7 @@ # Retrieve the LLaMA model path from environment variables. model_path = os.environ.get("LLAMA_MODEL_PATH") +model_path = "/home/liam/PLCT/Llama-2-7b-chat-hf" if model_path is None: raise EnvironmentError( "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid." @@ -44,22 +47,29 @@ # Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( primary_registry=tosa.ops_registry, - aot_autograd_decomposition=aot_autograd_decompositions, + aot_autograd_decomposition=inductor_decomp, ) # Import the model into MLIR module and parameters. with torch.no_grad(): - gm, params = dynamo_compiler.importer( - model, torch.tensor([[1 for _ in range(40)]], dtype=torch.int64) - ) + data = torch.tensor([[1 for i in range(40)]], dtype=torch.int64) + graphs = dynamo_compiler.importer(model, data) +assert len(graphs) == 1 +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +graph.lower_to_top_level_ir(False) path_prefix = os.path.dirname(os.path.abspath(__file__)) # Write the MLIR module to the file. 
with open(os.path.join(path_prefix, "llama.mlir"), "w") as module_file: - print(gm, file=module_file) + print(graph._imported_module, file=module_file) -# Concatenate all parameters into a single numpy array and write to a file. -all_param = numpy.concatenate( - [param.detach().numpy().reshape([-1]) for param in params] -) -all_param.tofile(os.path.join(path_prefix, "arg0.data")) +param_file = os.path.dirname(os.path.abspath(__file__)) + "/arg0.data" +if not os.path.exists(param_file): + # Concatenate all parameters into a single numpy array and write to a file. + all_param = numpy.concatenate( + [param.detach().numpy().reshape([-1]) for param in params] + ) + # if file exists, skip dumping + + all_param.tofile(param_file) diff --git a/examples/BuddyLlama/llama-gpu.cpp b/examples/BuddyLlama/llama-gpu.cpp new file mode 100644 index 000000000..e07f4a3cf --- /dev/null +++ b/examples/BuddyLlama/llama-gpu.cpp @@ -0,0 +1,189 @@ +//===- llama-main.cpp -----------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace buddy; + +constexpr size_t ParamsSize = 6755192832; +constexpr size_t MaxVocabSize = 32000; +constexpr size_t MaxTokenLength = 40; +constexpr size_t HiddenSize = 4096; + +/// Declare LLaMA forward function. 
+extern "C" void _mlir_ciface_forward(MemRef *, MemRef *, + Text *); + +// ----------------------------------------------------------------------------- +// Helper Functions +// ----------------------------------------------------------------------------- + +/// Capture input message. +void getUserInput(std::string &inputStr) { + std::cout << "\nPlease send a message:" << std::endl; + std::cout << ">>> "; + getline(std::cin, inputStr); + std::cout << std::endl; +} + +/// Print [Log] label in bold blue format. +void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; } + +/// Print information for each iteration. +void printIterInfo(size_t iterIdx, std::string str, double time) { + std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m"; + std::cout << "Token: " << str << " | " + << "Time: " << time << "s" << std::endl; +} + +/// Tokenize input data in the container. +void tokenizeInput(const std::string &vocabFile, + Text &inputContainer) { + printLogLabel(); + std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile) + << std::endl; + const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now(); + inputContainer.tokenizeLlama(vocabFile, MaxTokenLength); + const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now(); + const std::chrono::duration buddyTokenizeTime = + buddyTokenizeEnd - buddyTokenizeStart; + printLogLabel(); + std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms" + << std::endl; +} + +/// Load parameters into data container. +void loadParameters(const std::string ¶mFilePath, + MemRef ¶ms) { + const auto loadStart = std::chrono::high_resolution_clock::now(); + std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary); + if (!paramFile.is_open()) { + throw std::runtime_error("[Error] Failed to open params file!"); + } + printLogLabel(); + std::cout << "Loading params..." 
<< std::endl; + printLogLabel(); + std::cout << "Params file: " << std::filesystem::canonical(paramFilePath) + << std::endl; + paramFile.read(reinterpret_cast(params.getData()), + sizeof(float) * (params.getSize())); + if (paramFile.fail()) { + throw std::runtime_error("Error occurred while reading params file!"); + } + paramFile.close(); + const auto loadEnd = std::chrono::high_resolution_clock::now(); + const std::chrono::duration loadTime = + loadEnd - loadStart; + printLogLabel(); + std::cout << "Params load time: " << (double)(loadTime.count()) / 1000 + << "s\n" + << std::endl; +} + +/// Find the index of the max value. +int findMaxIndex(const float *start, const float *end) { + return std::distance(start, std::max_element(start, end)); +} + +// ----------------------------------------------------------------------------- +// LLaMA Inference Main Entry +// ----------------------------------------------------------------------------- + +int main() { + /// Print the title of this example. + const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler"; + std::cout << "\033[33;1m" << title << "\033[0m" << std::endl; + + /// Define directories of vacabulary and parameter file. + const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt"; + const std::string paramsDir = "../../examples/BuddyLlama/arg0.data"; + + /// Get user message. + std::string inputStr; + getUserInput(inputStr); + + /// Initialize data containers + // - Input container. + // - Result container + // - Output container. + // - Parameters container. + Text outputContainer; + MemRef resultContainer[2] = { + MemRef({1, MaxTokenLength, MaxVocabSize}, false, 0), + MemRef({1, MaxTokenLength, HiddenSize}, false, 0)}; + Text inputContainer(inputStr); + MemRef paramsContainer({ParamsSize}); + + /// Fill data into containers + // - Input: register vocabulary and tokenize the input string. + // - Output: register vocabulary. 
+ // - Parameters: load parameters from the `arg0` file into the container. + tokenizeInput(vocabDir, inputContainer); + outputContainer.loadVocab(vocabDir); + loadParameters(paramsDir, paramsContainer); + + /// Run LLaMA Inference + // - Perform the forward function. + // - Find and append the generated token. + // - Continue iterating until the terminal condition is met. + int generateLen = MaxTokenLength - inputContainer.getTokenCnt(); + for (int i = 0; i < generateLen; i++) { + const auto inferenceStart = std::chrono::high_resolution_clock::now(); + // Execute the forward pass of the model. + _mlir_ciface_forward(resultContainer, ¶msContainer, &inputContainer); + + const auto inferenceEnd = std::chrono::high_resolution_clock::now(); + const std::chrono::duration inferenceTime = + inferenceEnd - inferenceStart; + + // Determine the generated token. + int tokenIndex = inputContainer.getTokenCnt() - 1; + const float *startPtr = + resultContainer[0].getData() + tokenIndex * MaxVocabSize; + const float *endPtr = startPtr + MaxVocabSize; + int maxIndex = findMaxIndex(startPtr, endPtr); + std::string tok = inputContainer.getStr(maxIndex); + // Print the generated token and inference time. + printIterInfo(i, tok, inferenceTime.count() / 1000); + + // Stop if a separator token (2, ) or line break token (13 <0x0A>) is + // generated. + if (maxIndex == 2) { + break; + } + // Append the generated token into the input and output container. 
+ inputContainer.appendTokenIdx(maxIndex); + outputContainer.appendTokenIdx(maxIndex); + free(resultContainer[0].release()); + free(resultContainer[1].release()); + } + + /// Print the final result + std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl; + std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama() + << std::endl; + + return 0; +} diff --git a/examples/BuddyLlama/llama-linalg-default.mlir b/examples/BuddyLlama/llama-linalg-default.mlir new file mode 100644 index 000000000..ab99e1246 --- /dev/null +++ b/examples/BuddyLlama/llama-linalg-default.mlir @@ -0,0 +1,12905 @@ +#map = affine_map<(d0, d1) -> (d0, d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map3 = affine_map<(d0) -> (d0)> +#map4 = affine_map<(d0, d1, d2) -> (d1)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map8 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#map9 = affine_map<(d0, d1) -> (d1, d0)> +#map10 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> +#map11 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map12 = affine_map<(d0, d1) -> (0, d0, d1)> +#map13 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> +#map14 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)> +#map15 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)> +module { + func.func @forward(%arg0: tensor<6755192832xf32> {bufferization.writable = false}, %arg1: tensor<1x80xi64>) -> tensor<1x80x32000xf32> { + %extracted_slice = tensor.extract_slice %arg0[0] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_0 = tensor.extract_slice %arg0[4096] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_1 = tensor.extract_slice %arg0[8192] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_2 = tensor.extract_slice %arg0[12288] [4096] [1] : tensor<6755192832xf32> to 
tensor<4096xf32> + %extracted_slice_3 = tensor.extract_slice %arg0[16384] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_4 = tensor.extract_slice %arg0[20480] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_5 = tensor.extract_slice %arg0[24576] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_6 = tensor.extract_slice %arg0[28672] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_7 = tensor.extract_slice %arg0[32768] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_8 = tensor.extract_slice %arg0[36864] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_9 = tensor.extract_slice %arg0[40960] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_10 = tensor.extract_slice %arg0[45056] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_11 = tensor.extract_slice %arg0[49152] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_12 = tensor.extract_slice %arg0[53248] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_13 = tensor.extract_slice %arg0[57344] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_14 = tensor.extract_slice %arg0[61440] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_15 = tensor.extract_slice %arg0[65536] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_16 = tensor.extract_slice %arg0[69632] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_17 = tensor.extract_slice %arg0[73728] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_18 = tensor.extract_slice %arg0[77824] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_19 = tensor.extract_slice %arg0[81920] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_20 = tensor.extract_slice 
%arg0[86016] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_21 = tensor.extract_slice %arg0[90112] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_22 = tensor.extract_slice %arg0[94208] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_23 = tensor.extract_slice %arg0[98304] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_24 = tensor.extract_slice %arg0[102400] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_25 = tensor.extract_slice %arg0[106496] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_26 = tensor.extract_slice %arg0[110592] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_27 = tensor.extract_slice %arg0[114688] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_28 = tensor.extract_slice %arg0[118784] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_29 = tensor.extract_slice %arg0[122880] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_30 = tensor.extract_slice %arg0[126976] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_31 = tensor.extract_slice %arg0[131072] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_32 = tensor.extract_slice %arg0[135168] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_33 = tensor.extract_slice %arg0[139264] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_34 = tensor.extract_slice %arg0[143360] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_35 = tensor.extract_slice %arg0[147456] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_36 = tensor.extract_slice %arg0[151552] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_37 = tensor.extract_slice %arg0[155648] [4096] [1] : 
tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_38 = tensor.extract_slice %arg0[159744] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_39 = tensor.extract_slice %arg0[163840] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_40 = tensor.extract_slice %arg0[167936] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_41 = tensor.extract_slice %arg0[172032] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_42 = tensor.extract_slice %arg0[176128] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_43 = tensor.extract_slice %arg0[180224] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_44 = tensor.extract_slice %arg0[184320] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_45 = tensor.extract_slice %arg0[188416] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_46 = tensor.extract_slice %arg0[192512] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_47 = tensor.extract_slice %arg0[196608] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_48 = tensor.extract_slice %arg0[200704] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_49 = tensor.extract_slice %arg0[204800] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_50 = tensor.extract_slice %arg0[208896] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_51 = tensor.extract_slice %arg0[212992] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_52 = tensor.extract_slice %arg0[217088] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_53 = tensor.extract_slice %arg0[221184] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_54 = tensor.extract_slice %arg0[225280] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + 
%extracted_slice_55 = tensor.extract_slice %arg0[229376] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_56 = tensor.extract_slice %arg0[233472] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_57 = tensor.extract_slice %arg0[237568] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_58 = tensor.extract_slice %arg0[241664] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_59 = tensor.extract_slice %arg0[245760] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_60 = tensor.extract_slice %arg0[249856] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_61 = tensor.extract_slice %arg0[253952] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_62 = tensor.extract_slice %arg0[258048] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_63 = tensor.extract_slice %arg0[262144] [4096] [1] : tensor<6755192832xf32> to tensor<4096xf32> + %extracted_slice_64 = tensor.extract_slice %arg0[266240] [131072000] [1] : tensor<6755192832xf32> to tensor<131072000xf32> + %expanded = tensor.expand_shape %extracted_slice_64 [[0, 1]] : tensor<131072000xf32> into tensor<32000x4096xf32> + %extracted_slice_65 = tensor.extract_slice %arg0[131338240] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_66 = tensor.expand_shape %extracted_slice_65 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_67 = tensor.extract_slice %arg0[148115456] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_68 = tensor.expand_shape %extracted_slice_67 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_69 = tensor.extract_slice %arg0[164892672] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_70 = tensor.expand_shape %extracted_slice_69 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> 
+ %extracted_slice_71 = tensor.extract_slice %arg0[181669888] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_72 = tensor.expand_shape %extracted_slice_71 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_73 = tensor.extract_slice %arg0[198447104] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_74 = tensor.expand_shape %extracted_slice_73 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_75 = tensor.extract_slice %arg0[243535872] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_76 = tensor.expand_shape %extracted_slice_75 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_77 = tensor.extract_slice %arg0[288624640] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_78 = tensor.expand_shape %extracted_slice_77 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_79 = tensor.extract_slice %arg0[333713408] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_80 = tensor.expand_shape %extracted_slice_79 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_81 = tensor.extract_slice %arg0[350490624] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_82 = tensor.expand_shape %extracted_slice_81 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_83 = tensor.extract_slice %arg0[367267840] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_84 = tensor.expand_shape %extracted_slice_83 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_85 = tensor.extract_slice %arg0[384045056] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_86 = tensor.expand_shape %extracted_slice_85 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_87 = tensor.extract_slice 
%arg0[400822272] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_88 = tensor.expand_shape %extracted_slice_87 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_89 = tensor.extract_slice %arg0[445911040] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_90 = tensor.expand_shape %extracted_slice_89 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_91 = tensor.extract_slice %arg0[490999808] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_92 = tensor.expand_shape %extracted_slice_91 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_93 = tensor.extract_slice %arg0[536088576] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_94 = tensor.expand_shape %extracted_slice_93 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_95 = tensor.extract_slice %arg0[552865792] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_96 = tensor.expand_shape %extracted_slice_95 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_97 = tensor.extract_slice %arg0[569643008] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_98 = tensor.expand_shape %extracted_slice_97 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_99 = tensor.extract_slice %arg0[586420224] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_100 = tensor.expand_shape %extracted_slice_99 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_101 = tensor.extract_slice %arg0[603197440] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_102 = tensor.expand_shape %extracted_slice_101 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_103 = tensor.extract_slice %arg0[648286208] [45088768] [1] : 
tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_104 = tensor.expand_shape %extracted_slice_103 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_105 = tensor.extract_slice %arg0[693374976] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_106 = tensor.expand_shape %extracted_slice_105 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_107 = tensor.extract_slice %arg0[738463744] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_108 = tensor.expand_shape %extracted_slice_107 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_109 = tensor.extract_slice %arg0[755240960] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_110 = tensor.expand_shape %extracted_slice_109 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_111 = tensor.extract_slice %arg0[772018176] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_112 = tensor.expand_shape %extracted_slice_111 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_113 = tensor.extract_slice %arg0[788795392] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_114 = tensor.expand_shape %extracted_slice_113 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_115 = tensor.extract_slice %arg0[805572608] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_116 = tensor.expand_shape %extracted_slice_115 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_117 = tensor.extract_slice %arg0[850661376] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_118 = tensor.expand_shape %extracted_slice_117 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_119 = tensor.extract_slice %arg0[895750144] [45088768] [1] : tensor<6755192832xf32> to 
tensor<45088768xf32> + %expanded_120 = tensor.expand_shape %extracted_slice_119 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_121 = tensor.extract_slice %arg0[940838912] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_122 = tensor.expand_shape %extracted_slice_121 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_123 = tensor.extract_slice %arg0[957616128] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_124 = tensor.expand_shape %extracted_slice_123 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_125 = tensor.extract_slice %arg0[974393344] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_126 = tensor.expand_shape %extracted_slice_125 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_127 = tensor.extract_slice %arg0[991170560] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_128 = tensor.expand_shape %extracted_slice_127 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_129 = tensor.extract_slice %arg0[1007947776] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_130 = tensor.expand_shape %extracted_slice_129 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_131 = tensor.extract_slice %arg0[1053036544] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_132 = tensor.expand_shape %extracted_slice_131 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_133 = tensor.extract_slice %arg0[1098125312] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_134 = tensor.expand_shape %extracted_slice_133 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_135 = tensor.extract_slice %arg0[1143214080] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + 
%expanded_136 = tensor.expand_shape %extracted_slice_135 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_137 = tensor.extract_slice %arg0[1159991296] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_138 = tensor.expand_shape %extracted_slice_137 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_139 = tensor.extract_slice %arg0[1176768512] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_140 = tensor.expand_shape %extracted_slice_139 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_141 = tensor.extract_slice %arg0[1193545728] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_142 = tensor.expand_shape %extracted_slice_141 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_143 = tensor.extract_slice %arg0[1210322944] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_144 = tensor.expand_shape %extracted_slice_143 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_145 = tensor.extract_slice %arg0[1255411712] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_146 = tensor.expand_shape %extracted_slice_145 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_147 = tensor.extract_slice %arg0[1300500480] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_148 = tensor.expand_shape %extracted_slice_147 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_149 = tensor.extract_slice %arg0[1345589248] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_150 = tensor.expand_shape %extracted_slice_149 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_151 = tensor.extract_slice %arg0[1362366464] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_152 = 
tensor.expand_shape %extracted_slice_151 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_153 = tensor.extract_slice %arg0[1379143680] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_154 = tensor.expand_shape %extracted_slice_153 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_155 = tensor.extract_slice %arg0[1395920896] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_156 = tensor.expand_shape %extracted_slice_155 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_157 = tensor.extract_slice %arg0[1412698112] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_158 = tensor.expand_shape %extracted_slice_157 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_159 = tensor.extract_slice %arg0[1457786880] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_160 = tensor.expand_shape %extracted_slice_159 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_161 = tensor.extract_slice %arg0[1502875648] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_162 = tensor.expand_shape %extracted_slice_161 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_163 = tensor.extract_slice %arg0[1547964416] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_164 = tensor.expand_shape %extracted_slice_163 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_165 = tensor.extract_slice %arg0[1564741632] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_166 = tensor.expand_shape %extracted_slice_165 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_167 = tensor.extract_slice %arg0[1581518848] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_168 = tensor.expand_shape 
%extracted_slice_167 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_169 = tensor.extract_slice %arg0[1598296064] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_170 = tensor.expand_shape %extracted_slice_169 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_171 = tensor.extract_slice %arg0[1615073280] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_172 = tensor.expand_shape %extracted_slice_171 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_173 = tensor.extract_slice %arg0[1660162048] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_174 = tensor.expand_shape %extracted_slice_173 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_175 = tensor.extract_slice %arg0[1705250816] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_176 = tensor.expand_shape %extracted_slice_175 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_177 = tensor.extract_slice %arg0[1750339584] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_178 = tensor.expand_shape %extracted_slice_177 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_179 = tensor.extract_slice %arg0[1767116800] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_180 = tensor.expand_shape %extracted_slice_179 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_181 = tensor.extract_slice %arg0[1783894016] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_182 = tensor.expand_shape %extracted_slice_181 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_183 = tensor.extract_slice %arg0[1800671232] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_184 = tensor.expand_shape %extracted_slice_183 [[0, 
1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_185 = tensor.extract_slice %arg0[1817448448] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_186 = tensor.expand_shape %extracted_slice_185 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_187 = tensor.extract_slice %arg0[1862537216] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_188 = tensor.expand_shape %extracted_slice_187 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_189 = tensor.extract_slice %arg0[1907625984] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_190 = tensor.expand_shape %extracted_slice_189 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_191 = tensor.extract_slice %arg0[1952714752] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_192 = tensor.expand_shape %extracted_slice_191 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_193 = tensor.extract_slice %arg0[1969491968] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_194 = tensor.expand_shape %extracted_slice_193 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_195 = tensor.extract_slice %arg0[1986269184] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_196 = tensor.expand_shape %extracted_slice_195 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_197 = tensor.extract_slice %arg0[2003046400] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_198 = tensor.expand_shape %extracted_slice_197 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_199 = tensor.extract_slice %arg0[2019823616] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_200 = tensor.expand_shape %extracted_slice_199 [[0, 1]] : tensor<45088768xf32> 
into tensor<11008x4096xf32> + %extracted_slice_201 = tensor.extract_slice %arg0[2064912384] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_202 = tensor.expand_shape %extracted_slice_201 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_203 = tensor.extract_slice %arg0[2110001152] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_204 = tensor.expand_shape %extracted_slice_203 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_205 = tensor.extract_slice %arg0[2155089920] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_206 = tensor.expand_shape %extracted_slice_205 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_207 = tensor.extract_slice %arg0[2171867136] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_208 = tensor.expand_shape %extracted_slice_207 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_209 = tensor.extract_slice %arg0[2188644352] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_210 = tensor.expand_shape %extracted_slice_209 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_211 = tensor.extract_slice %arg0[2205421568] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_212 = tensor.expand_shape %extracted_slice_211 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_213 = tensor.extract_slice %arg0[2222198784] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_214 = tensor.expand_shape %extracted_slice_213 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_215 = tensor.extract_slice %arg0[2267287552] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_216 = tensor.expand_shape %extracted_slice_215 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> 
+ %extracted_slice_217 = tensor.extract_slice %arg0[2312376320] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_218 = tensor.expand_shape %extracted_slice_217 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_219 = tensor.extract_slice %arg0[2357465088] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_220 = tensor.expand_shape %extracted_slice_219 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_221 = tensor.extract_slice %arg0[2374242304] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_222 = tensor.expand_shape %extracted_slice_221 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_223 = tensor.extract_slice %arg0[2391019520] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_224 = tensor.expand_shape %extracted_slice_223 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_225 = tensor.extract_slice %arg0[2407796736] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_226 = tensor.expand_shape %extracted_slice_225 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_227 = tensor.extract_slice %arg0[2424573952] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_228 = tensor.expand_shape %extracted_slice_227 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_229 = tensor.extract_slice %arg0[2469662720] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_230 = tensor.expand_shape %extracted_slice_229 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_231 = tensor.extract_slice %arg0[2514751488] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_232 = tensor.expand_shape %extracted_slice_231 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_233 = 
tensor.extract_slice %arg0[2559840256] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_234 = tensor.expand_shape %extracted_slice_233 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_235 = tensor.extract_slice %arg0[2576617472] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_236 = tensor.expand_shape %extracted_slice_235 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_237 = tensor.extract_slice %arg0[2593394688] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_238 = tensor.expand_shape %extracted_slice_237 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_239 = tensor.extract_slice %arg0[2610171904] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_240 = tensor.expand_shape %extracted_slice_239 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_241 = tensor.extract_slice %arg0[2626949120] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_242 = tensor.expand_shape %extracted_slice_241 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_243 = tensor.extract_slice %arg0[2672037888] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_244 = tensor.expand_shape %extracted_slice_243 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_245 = tensor.extract_slice %arg0[2717126656] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_246 = tensor.expand_shape %extracted_slice_245 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_247 = tensor.extract_slice %arg0[2762215424] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_248 = tensor.expand_shape %extracted_slice_247 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_249 = tensor.extract_slice 
%arg0[2778992640] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_250 = tensor.expand_shape %extracted_slice_249 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_251 = tensor.extract_slice %arg0[2795769856] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_252 = tensor.expand_shape %extracted_slice_251 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_253 = tensor.extract_slice %arg0[2812547072] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_254 = tensor.expand_shape %extracted_slice_253 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_255 = tensor.extract_slice %arg0[2829324288] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_256 = tensor.expand_shape %extracted_slice_255 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_257 = tensor.extract_slice %arg0[2874413056] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_258 = tensor.expand_shape %extracted_slice_257 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_259 = tensor.extract_slice %arg0[2919501824] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_260 = tensor.expand_shape %extracted_slice_259 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_261 = tensor.extract_slice %arg0[2964590592] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_262 = tensor.expand_shape %extracted_slice_261 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_263 = tensor.extract_slice %arg0[2981367808] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_264 = tensor.expand_shape %extracted_slice_263 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_265 = tensor.extract_slice %arg0[2998145024] [16777216] 
[1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_266 = tensor.expand_shape %extracted_slice_265 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_267 = tensor.extract_slice %arg0[3014922240] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_268 = tensor.expand_shape %extracted_slice_267 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_269 = tensor.extract_slice %arg0[3031699456] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_270 = tensor.expand_shape %extracted_slice_269 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_271 = tensor.extract_slice %arg0[3076788224] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_272 = tensor.expand_shape %extracted_slice_271 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_273 = tensor.extract_slice %arg0[3121876992] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_274 = tensor.expand_shape %extracted_slice_273 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_275 = tensor.extract_slice %arg0[3166965760] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_276 = tensor.expand_shape %extracted_slice_275 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_277 = tensor.extract_slice %arg0[3183742976] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_278 = tensor.expand_shape %extracted_slice_277 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_279 = tensor.extract_slice %arg0[3200520192] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_280 = tensor.expand_shape %extracted_slice_279 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_281 = tensor.extract_slice %arg0[3217297408] [16777216] [1] : tensor<6755192832xf32> 
to tensor<16777216xf32> + %expanded_282 = tensor.expand_shape %extracted_slice_281 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_283 = tensor.extract_slice %arg0[3234074624] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_284 = tensor.expand_shape %extracted_slice_283 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_285 = tensor.extract_slice %arg0[3279163392] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_286 = tensor.expand_shape %extracted_slice_285 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_287 = tensor.extract_slice %arg0[3324252160] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_288 = tensor.expand_shape %extracted_slice_287 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_289 = tensor.extract_slice %arg0[3369340928] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_290 = tensor.expand_shape %extracted_slice_289 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_291 = tensor.extract_slice %arg0[3386118144] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_292 = tensor.expand_shape %extracted_slice_291 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_293 = tensor.extract_slice %arg0[3402895360] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_294 = tensor.expand_shape %extracted_slice_293 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_295 = tensor.extract_slice %arg0[3419672576] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_296 = tensor.expand_shape %extracted_slice_295 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_297 = tensor.extract_slice %arg0[3436449792] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + 
%expanded_298 = tensor.expand_shape %extracted_slice_297 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_299 = tensor.extract_slice %arg0[3481538560] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_300 = tensor.expand_shape %extracted_slice_299 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_301 = tensor.extract_slice %arg0[3526627328] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_302 = tensor.expand_shape %extracted_slice_301 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_303 = tensor.extract_slice %arg0[3571716096] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_304 = tensor.expand_shape %extracted_slice_303 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_305 = tensor.extract_slice %arg0[3588493312] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_306 = tensor.expand_shape %extracted_slice_305 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_307 = tensor.extract_slice %arg0[3605270528] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_308 = tensor.expand_shape %extracted_slice_307 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_309 = tensor.extract_slice %arg0[3622047744] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_310 = tensor.expand_shape %extracted_slice_309 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_311 = tensor.extract_slice %arg0[3638824960] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_312 = tensor.expand_shape %extracted_slice_311 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_313 = tensor.extract_slice %arg0[3683913728] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_314 = 
tensor.expand_shape %extracted_slice_313 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_315 = tensor.extract_slice %arg0[3729002496] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_316 = tensor.expand_shape %extracted_slice_315 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_317 = tensor.extract_slice %arg0[3774091264] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_318 = tensor.expand_shape %extracted_slice_317 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_319 = tensor.extract_slice %arg0[3790868480] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_320 = tensor.expand_shape %extracted_slice_319 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_321 = tensor.extract_slice %arg0[3807645696] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_322 = tensor.expand_shape %extracted_slice_321 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_323 = tensor.extract_slice %arg0[3824422912] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_324 = tensor.expand_shape %extracted_slice_323 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_325 = tensor.extract_slice %arg0[3841200128] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_326 = tensor.expand_shape %extracted_slice_325 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_327 = tensor.extract_slice %arg0[3886288896] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_328 = tensor.expand_shape %extracted_slice_327 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_329 = tensor.extract_slice %arg0[3931377664] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_330 = tensor.expand_shape 
%extracted_slice_329 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_331 = tensor.extract_slice %arg0[3976466432] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_332 = tensor.expand_shape %extracted_slice_331 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_333 = tensor.extract_slice %arg0[3993243648] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_334 = tensor.expand_shape %extracted_slice_333 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_335 = tensor.extract_slice %arg0[4010020864] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_336 = tensor.expand_shape %extracted_slice_335 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_337 = tensor.extract_slice %arg0[4026798080] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_338 = tensor.expand_shape %extracted_slice_337 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_339 = tensor.extract_slice %arg0[4043575296] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_340 = tensor.expand_shape %extracted_slice_339 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_341 = tensor.extract_slice %arg0[4088664064] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_342 = tensor.expand_shape %extracted_slice_341 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_343 = tensor.extract_slice %arg0[4133752832] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_344 = tensor.expand_shape %extracted_slice_343 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_345 = tensor.extract_slice %arg0[4178841600] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_346 = tensor.expand_shape %extracted_slice_345 [[0, 
1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_347 = tensor.extract_slice %arg0[4195618816] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_348 = tensor.expand_shape %extracted_slice_347 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_349 = tensor.extract_slice %arg0[4212396032] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_350 = tensor.expand_shape %extracted_slice_349 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_351 = tensor.extract_slice %arg0[4229173248] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_352 = tensor.expand_shape %extracted_slice_351 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_353 = tensor.extract_slice %arg0[4245950464] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_354 = tensor.expand_shape %extracted_slice_353 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_355 = tensor.extract_slice %arg0[4291039232] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_356 = tensor.expand_shape %extracted_slice_355 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_357 = tensor.extract_slice %arg0[4336128000] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_358 = tensor.expand_shape %extracted_slice_357 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_359 = tensor.extract_slice %arg0[4381216768] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_360 = tensor.expand_shape %extracted_slice_359 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_361 = tensor.extract_slice %arg0[4397993984] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_362 = tensor.expand_shape %extracted_slice_361 [[0, 1]] : tensor<16777216xf32> 
into tensor<4096x4096xf32> + %extracted_slice_363 = tensor.extract_slice %arg0[4414771200] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_364 = tensor.expand_shape %extracted_slice_363 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_365 = tensor.extract_slice %arg0[4431548416] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_366 = tensor.expand_shape %extracted_slice_365 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_367 = tensor.extract_slice %arg0[4448325632] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_368 = tensor.expand_shape %extracted_slice_367 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_369 = tensor.extract_slice %arg0[4493414400] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_370 = tensor.expand_shape %extracted_slice_369 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_371 = tensor.extract_slice %arg0[4538503168] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_372 = tensor.expand_shape %extracted_slice_371 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_373 = tensor.extract_slice %arg0[4583591936] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_374 = tensor.expand_shape %extracted_slice_373 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_375 = tensor.extract_slice %arg0[4600369152] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_376 = tensor.expand_shape %extracted_slice_375 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_377 = tensor.extract_slice %arg0[4617146368] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_378 = tensor.expand_shape %extracted_slice_377 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + 
%extracted_slice_379 = tensor.extract_slice %arg0[4633923584] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_380 = tensor.expand_shape %extracted_slice_379 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_381 = tensor.extract_slice %arg0[4650700800] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_382 = tensor.expand_shape %extracted_slice_381 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_383 = tensor.extract_slice %arg0[4695789568] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_384 = tensor.expand_shape %extracted_slice_383 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_385 = tensor.extract_slice %arg0[4740878336] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_386 = tensor.expand_shape %extracted_slice_385 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_387 = tensor.extract_slice %arg0[4785967104] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_388 = tensor.expand_shape %extracted_slice_387 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_389 = tensor.extract_slice %arg0[4802744320] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_390 = tensor.expand_shape %extracted_slice_389 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_391 = tensor.extract_slice %arg0[4819521536] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_392 = tensor.expand_shape %extracted_slice_391 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_393 = tensor.extract_slice %arg0[4836298752] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_394 = tensor.expand_shape %extracted_slice_393 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_395 = 
tensor.extract_slice %arg0[4853075968] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_396 = tensor.expand_shape %extracted_slice_395 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_397 = tensor.extract_slice %arg0[4898164736] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_398 = tensor.expand_shape %extracted_slice_397 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_399 = tensor.extract_slice %arg0[4943253504] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_400 = tensor.expand_shape %extracted_slice_399 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_401 = tensor.extract_slice %arg0[4988342272] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_402 = tensor.expand_shape %extracted_slice_401 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_403 = tensor.extract_slice %arg0[5005119488] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_404 = tensor.expand_shape %extracted_slice_403 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_405 = tensor.extract_slice %arg0[5021896704] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_406 = tensor.expand_shape %extracted_slice_405 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_407 = tensor.extract_slice %arg0[5038673920] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_408 = tensor.expand_shape %extracted_slice_407 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_409 = tensor.extract_slice %arg0[5055451136] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_410 = tensor.expand_shape %extracted_slice_409 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_411 = tensor.extract_slice 
%arg0[5100539904] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_412 = tensor.expand_shape %extracted_slice_411 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_413 = tensor.extract_slice %arg0[5145628672] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_414 = tensor.expand_shape %extracted_slice_413 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_415 = tensor.extract_slice %arg0[5190717440] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_416 = tensor.expand_shape %extracted_slice_415 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_417 = tensor.extract_slice %arg0[5207494656] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_418 = tensor.expand_shape %extracted_slice_417 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_419 = tensor.extract_slice %arg0[5224271872] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_420 = tensor.expand_shape %extracted_slice_419 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_421 = tensor.extract_slice %arg0[5241049088] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_422 = tensor.expand_shape %extracted_slice_421 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_423 = tensor.extract_slice %arg0[5257826304] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_424 = tensor.expand_shape %extracted_slice_423 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_425 = tensor.extract_slice %arg0[5302915072] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_426 = tensor.expand_shape %extracted_slice_425 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_427 = tensor.extract_slice %arg0[5348003840] 
[45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_428 = tensor.expand_shape %extracted_slice_427 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_429 = tensor.extract_slice %arg0[5393092608] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_430 = tensor.expand_shape %extracted_slice_429 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_431 = tensor.extract_slice %arg0[5409869824] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_432 = tensor.expand_shape %extracted_slice_431 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_433 = tensor.extract_slice %arg0[5426647040] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_434 = tensor.expand_shape %extracted_slice_433 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_435 = tensor.extract_slice %arg0[5443424256] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_436 = tensor.expand_shape %extracted_slice_435 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_437 = tensor.extract_slice %arg0[5460201472] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_438 = tensor.expand_shape %extracted_slice_437 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_439 = tensor.extract_slice %arg0[5505290240] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_440 = tensor.expand_shape %extracted_slice_439 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_441 = tensor.extract_slice %arg0[5550379008] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_442 = tensor.expand_shape %extracted_slice_441 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_443 = tensor.extract_slice %arg0[5595467776] [16777216] [1] : 
tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_444 = tensor.expand_shape %extracted_slice_443 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_445 = tensor.extract_slice %arg0[5612244992] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_446 = tensor.expand_shape %extracted_slice_445 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_447 = tensor.extract_slice %arg0[5629022208] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_448 = tensor.expand_shape %extracted_slice_447 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_449 = tensor.extract_slice %arg0[5645799424] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_450 = tensor.expand_shape %extracted_slice_449 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_451 = tensor.extract_slice %arg0[5662576640] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_452 = tensor.expand_shape %extracted_slice_451 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_453 = tensor.extract_slice %arg0[5707665408] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_454 = tensor.expand_shape %extracted_slice_453 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_455 = tensor.extract_slice %arg0[5752754176] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_456 = tensor.expand_shape %extracted_slice_455 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_457 = tensor.extract_slice %arg0[5797842944] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_458 = tensor.expand_shape %extracted_slice_457 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_459 = tensor.extract_slice %arg0[5814620160] [16777216] [1] : tensor<6755192832xf32> to 
tensor<16777216xf32> + %expanded_460 = tensor.expand_shape %extracted_slice_459 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_461 = tensor.extract_slice %arg0[5831397376] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_462 = tensor.expand_shape %extracted_slice_461 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_463 = tensor.extract_slice %arg0[5848174592] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_464 = tensor.expand_shape %extracted_slice_463 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_465 = tensor.extract_slice %arg0[5864951808] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_466 = tensor.expand_shape %extracted_slice_465 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_467 = tensor.extract_slice %arg0[5910040576] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_468 = tensor.expand_shape %extracted_slice_467 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_469 = tensor.extract_slice %arg0[5955129344] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_470 = tensor.expand_shape %extracted_slice_469 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_471 = tensor.extract_slice %arg0[6000218112] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_472 = tensor.expand_shape %extracted_slice_471 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_473 = tensor.extract_slice %arg0[6016995328] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_474 = tensor.expand_shape %extracted_slice_473 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_475 = tensor.extract_slice %arg0[6033772544] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + 
%expanded_476 = tensor.expand_shape %extracted_slice_475 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_477 = tensor.extract_slice %arg0[6050549760] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_478 = tensor.expand_shape %extracted_slice_477 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_479 = tensor.extract_slice %arg0[6067326976] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_480 = tensor.expand_shape %extracted_slice_479 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_481 = tensor.extract_slice %arg0[6112415744] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_482 = tensor.expand_shape %extracted_slice_481 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_483 = tensor.extract_slice %arg0[6157504512] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_484 = tensor.expand_shape %extracted_slice_483 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_485 = tensor.extract_slice %arg0[6202593280] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_486 = tensor.expand_shape %extracted_slice_485 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_487 = tensor.extract_slice %arg0[6219370496] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_488 = tensor.expand_shape %extracted_slice_487 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_489 = tensor.extract_slice %arg0[6236147712] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_490 = tensor.expand_shape %extracted_slice_489 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_491 = tensor.extract_slice %arg0[6252924928] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_492 = 
tensor.expand_shape %extracted_slice_491 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_493 = tensor.extract_slice %arg0[6269702144] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_494 = tensor.expand_shape %extracted_slice_493 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_495 = tensor.extract_slice %arg0[6314790912] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_496 = tensor.expand_shape %extracted_slice_495 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_497 = tensor.extract_slice %arg0[6359879680] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_498 = tensor.expand_shape %extracted_slice_497 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_499 = tensor.extract_slice %arg0[6404968448] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_500 = tensor.expand_shape %extracted_slice_499 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_501 = tensor.extract_slice %arg0[6421745664] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_502 = tensor.expand_shape %extracted_slice_501 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_503 = tensor.extract_slice %arg0[6438522880] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_504 = tensor.expand_shape %extracted_slice_503 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_505 = tensor.extract_slice %arg0[6455300096] [16777216] [1] : tensor<6755192832xf32> to tensor<16777216xf32> + %expanded_506 = tensor.expand_shape %extracted_slice_505 [[0, 1]] : tensor<16777216xf32> into tensor<4096x4096xf32> + %extracted_slice_507 = tensor.extract_slice %arg0[6472077312] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_508 = tensor.expand_shape 
%extracted_slice_507 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_509 = tensor.extract_slice %arg0[6517166080] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_510 = tensor.expand_shape %extracted_slice_509 [[0, 1]] : tensor<45088768xf32> into tensor<11008x4096xf32> + %extracted_slice_511 = tensor.extract_slice %arg0[6562254848] [45088768] [1] : tensor<6755192832xf32> to tensor<45088768xf32> + %expanded_512 = tensor.expand_shape %extracted_slice_511 [[0, 1]] : tensor<45088768xf32> into tensor<4096x11008xf32> + %extracted_slice_513 = tensor.extract_slice %arg0[6607343616] [131072000] [1] : tensor<6755192832xf32> to tensor<131072000xf32> + %expanded_514 = tensor.expand_shape %extracted_slice_513 [[0, 1]] : tensor<131072000xf32> into tensor<32000x4096xf32> + %extracted_slice_515 = tensor.extract_slice %arg0[6738415616] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_516 = tensor.expand_shape %extracted_slice_515 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_517 = tensor.extract_slice %arg0[6738677760] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_518 = tensor.expand_shape %extracted_slice_517 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_519 = tensor.extract_slice %arg0[6738939904] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_520 = tensor.expand_shape %extracted_slice_519 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_521 = tensor.extract_slice %arg0[6739202048] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_522 = tensor.expand_shape %extracted_slice_521 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_523 = tensor.extract_slice %arg0[6739464192] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_524 = tensor.expand_shape 
%extracted_slice_523 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_525 = tensor.extract_slice %arg0[6739726336] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_526 = tensor.expand_shape %extracted_slice_525 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_527 = tensor.extract_slice %arg0[6739988480] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_528 = tensor.expand_shape %extracted_slice_527 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_529 = tensor.extract_slice %arg0[6740250624] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_530 = tensor.expand_shape %extracted_slice_529 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_531 = tensor.extract_slice %arg0[6740512768] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_532 = tensor.expand_shape %extracted_slice_531 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_533 = tensor.extract_slice %arg0[6740774912] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_534 = tensor.expand_shape %extracted_slice_533 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_535 = tensor.extract_slice %arg0[6741037056] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_536 = tensor.expand_shape %extracted_slice_535 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_537 = tensor.extract_slice %arg0[6741299200] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_538 = tensor.expand_shape %extracted_slice_537 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_539 = tensor.extract_slice %arg0[6741561344] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_540 = tensor.expand_shape 
%extracted_slice_539 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_541 = tensor.extract_slice %arg0[6741823488] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_542 = tensor.expand_shape %extracted_slice_541 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_543 = tensor.extract_slice %arg0[6742085632] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_544 = tensor.expand_shape %extracted_slice_543 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_545 = tensor.extract_slice %arg0[6742347776] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_546 = tensor.expand_shape %extracted_slice_545 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_547 = tensor.extract_slice %arg0[6742609920] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_548 = tensor.expand_shape %extracted_slice_547 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_549 = tensor.extract_slice %arg0[6742872064] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_550 = tensor.expand_shape %extracted_slice_549 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_551 = tensor.extract_slice %arg0[6743134208] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_552 = tensor.expand_shape %extracted_slice_551 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_553 = tensor.extract_slice %arg0[6743396352] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_554 = tensor.expand_shape %extracted_slice_553 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_555 = tensor.extract_slice %arg0[6743658496] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_556 = tensor.expand_shape 
%extracted_slice_555 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_557 = tensor.extract_slice %arg0[6743920640] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_558 = tensor.expand_shape %extracted_slice_557 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_559 = tensor.extract_slice %arg0[6744182784] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_560 = tensor.expand_shape %extracted_slice_559 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_561 = tensor.extract_slice %arg0[6744444928] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_562 = tensor.expand_shape %extracted_slice_561 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_563 = tensor.extract_slice %arg0[6744707072] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_564 = tensor.expand_shape %extracted_slice_563 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_565 = tensor.extract_slice %arg0[6744969216] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_566 = tensor.expand_shape %extracted_slice_565 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_567 = tensor.extract_slice %arg0[6745231360] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_568 = tensor.expand_shape %extracted_slice_567 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_569 = tensor.extract_slice %arg0[6745493504] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_570 = tensor.expand_shape %extracted_slice_569 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_571 = tensor.extract_slice %arg0[6745755648] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_572 = tensor.expand_shape 
%extracted_slice_571 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_573 = tensor.extract_slice %arg0[6746017792] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_574 = tensor.expand_shape %extracted_slice_573 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_575 = tensor.extract_slice %arg0[6746279936] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_576 = tensor.expand_shape %extracted_slice_575 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_577 = tensor.extract_slice %arg0[6746542080] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_578 = tensor.expand_shape %extracted_slice_577 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_579 = tensor.extract_slice %arg0[6746804224] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_580 = tensor.expand_shape %extracted_slice_579 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_581 = tensor.extract_slice %arg0[6747066368] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_582 = tensor.expand_shape %extracted_slice_581 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_583 = tensor.extract_slice %arg0[6747328512] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_584 = tensor.expand_shape %extracted_slice_583 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_585 = tensor.extract_slice %arg0[6747590656] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_586 = tensor.expand_shape %extracted_slice_585 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_587 = tensor.extract_slice %arg0[6747852800] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_588 = tensor.expand_shape 
%extracted_slice_587 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_589 = tensor.extract_slice %arg0[6748114944] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_590 = tensor.expand_shape %extracted_slice_589 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_591 = tensor.extract_slice %arg0[6748377088] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_592 = tensor.expand_shape %extracted_slice_591 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_593 = tensor.extract_slice %arg0[6748639232] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_594 = tensor.expand_shape %extracted_slice_593 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_595 = tensor.extract_slice %arg0[6748901376] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_596 = tensor.expand_shape %extracted_slice_595 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_597 = tensor.extract_slice %arg0[6749163520] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_598 = tensor.expand_shape %extracted_slice_597 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_599 = tensor.extract_slice %arg0[6749425664] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_600 = tensor.expand_shape %extracted_slice_599 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_601 = tensor.extract_slice %arg0[6749687808] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_602 = tensor.expand_shape %extracted_slice_601 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_603 = tensor.extract_slice %arg0[6749949952] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_604 = tensor.expand_shape 
%extracted_slice_603 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_605 = tensor.extract_slice %arg0[6750212096] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_606 = tensor.expand_shape %extracted_slice_605 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_607 = tensor.extract_slice %arg0[6750474240] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_608 = tensor.expand_shape %extracted_slice_607 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_609 = tensor.extract_slice %arg0[6750736384] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_610 = tensor.expand_shape %extracted_slice_609 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_611 = tensor.extract_slice %arg0[6750998528] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_612 = tensor.expand_shape %extracted_slice_611 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_613 = tensor.extract_slice %arg0[6751260672] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_614 = tensor.expand_shape %extracted_slice_613 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_615 = tensor.extract_slice %arg0[6751522816] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_616 = tensor.expand_shape %extracted_slice_615 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_617 = tensor.extract_slice %arg0[6751784960] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_618 = tensor.expand_shape %extracted_slice_617 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_619 = tensor.extract_slice %arg0[6752047104] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_620 = tensor.expand_shape 
%extracted_slice_619 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_621 = tensor.extract_slice %arg0[6752309248] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_622 = tensor.expand_shape %extracted_slice_621 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_623 = tensor.extract_slice %arg0[6752571392] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_624 = tensor.expand_shape %extracted_slice_623 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_625 = tensor.extract_slice %arg0[6752833536] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_626 = tensor.expand_shape %extracted_slice_625 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_627 = tensor.extract_slice %arg0[6753095680] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_628 = tensor.expand_shape %extracted_slice_627 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_629 = tensor.extract_slice %arg0[6753357824] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_630 = tensor.expand_shape %extracted_slice_629 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_631 = tensor.extract_slice %arg0[6753619968] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_632 = tensor.expand_shape %extracted_slice_631 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_633 = tensor.extract_slice %arg0[6753882112] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_634 = tensor.expand_shape %extracted_slice_633 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_635 = tensor.extract_slice %arg0[6754144256] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_636 = tensor.expand_shape 
%extracted_slice_635 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_637 = tensor.extract_slice %arg0[6754406400] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_638 = tensor.expand_shape %extracted_slice_637 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_639 = tensor.extract_slice %arg0[6754668544] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_640 = tensor.expand_shape %extracted_slice_639 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %extracted_slice_641 = tensor.extract_slice %arg0[6754930688] [262144] [1] : tensor<6755192832xf32> to tensor<262144xf32> + %expanded_642 = tensor.expand_shape %extracted_slice_641 [[0, 1, 2, 3]] : tensor<262144xf32> into tensor<1x1x2048x128xf32> + %cst = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]> : tensor<80xi64> + %cst_643 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]]> : tensor<1x80xi64> + %0 = tensor.empty() : tensor<1x80xi32> + %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<1x80xi64>) outs(%0 : tensor<1x80xi32>) { + ^bb0(%in: i64, %out: i32): + %3652 = arith.trunci %in : i64 to i32 + linalg.yield %3652 : i32 + } -> tensor<1x80xi32> + %expanded_644 = tensor.expand_shape %expanded [[0, 1], [2]] : tensor<32000x4096xf32> into tensor<1x32000x4096xf32> + %2 
= tensor.empty() : tensor<1x80x4096xf32> + %3 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x80xi32>) outs(%2 : tensor<1x80x4096xf32>) { + ^bb0(%in: i32, %out: f32): + %3652 = linalg.index 0 : index + %3653 = arith.index_cast %in : i32 to index + %3654 = linalg.index 2 : index + %extracted = tensor.extract %expanded_644[%3652, %3653, %3654] : tensor<1x32000x4096xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x4096xf32> + %cst_645 = arith.constant dense : tensor<1x80xi1> + %cst_646 = arith.constant dense<-3.40282347E+38> : tensor<80x80xf32> + %cst_647 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]> : tensor<80xi64> + %cst_648 = arith.constant dense<1> : tensor<80xi64> + %4 = tensor.empty() : tensor<80xi64> + %5 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel"]} ins(%cst_647, %cst_648 : tensor<80xi64>, tensor<80xi64>) outs(%4 : tensor<80xi64>) { + ^bb0(%in: i64, %in_2684: i64, %out: i64): + %3652 = arith.addi %in, %in_2684 : i64 + linalg.yield %3652 : i64 + } -> tensor<80xi64> + %expanded_649 = tensor.expand_shape %5 [[0, 1]] : tensor<80xi64> into tensor<80x1xi64> + %6 = tensor.empty() : tensor<80x80xi1> + %7 = linalg.generic {indexing_maps = [#map4, #map5, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%cst_647, %expanded_649 : tensor<80xi64>, tensor<80x1xi64>) outs(%6 : tensor<80x80xi1>) { + ^bb0(%in: i64, %in_2684: i64, %out: i1): + %3652 = arith.cmpi slt, %in, %in_2684 : i64 + linalg.yield %3652 : i1 + } -> tensor<80x80xi1> + %cst_650 = arith.constant 0.000000e+00 : f32 + %8 = tensor.empty() : tensor<80x80xf32> + %9 = linalg.generic 
{indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%cst_646, %7 : tensor<80x80xf32>, tensor<80x80xi1>) outs(%8 : tensor<80x80xf32>) { + ^bb0(%in: f32, %in_2684: i1, %out: f32): + %3652 = arith.select %in_2684, %cst_650, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<80x80xf32> + %cst_651 = arith.constant dense : tensor<1x80xi1> + %expanded_652 = tensor.expand_shape %cst_651 [[0, 1], [2]] : tensor<1x80xi1> into tensor<1x1x80xi1> + %expanded_653 = tensor.expand_shape %expanded_652 [[0], [1, 2], [3]] : tensor<1x1x80xi1> into tensor<1x1x1x80xi1> + %cst_654 = arith.constant dense : tensor<1x1x80x80xi1> + %10 = tensor.empty() : tensor<1x1x80x80xi1> + %collapsed = tensor.collapse_shape %expanded_653 [[0], [1, 2], [3]] : tensor<1x1x1x80xi1> into tensor<1x1x80xi1> + %11 = linalg.generic {indexing_maps = [#map6, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed, %cst_654 : tensor<1x1x80xi1>, tensor<1x1x80x80xi1>) outs(%10 : tensor<1x1x80x80xi1>) { + ^bb0(%in: i1, %in_2684: i1, %out: i1): + %3652 = arith.addi %in, %in_2684 : i1 + linalg.yield %3652 : i1 + } -> tensor<1x1x80x80xi1> + %12 = tensor.empty() : tensor<1x1x80x80xf32> + %13 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<1x1x80x80xi1>) outs(%12 : tensor<1x1x80x80xf32>) { + ^bb0(%in: i1, %out: f32): + %3652 = arith.extui %in : i1 to i32 + %3653 = arith.sitofp %3652 : i32 to f32 + linalg.yield %3653 : f32 + } -> tensor<1x1x80x80xf32> + %cst_655 = arith.constant 1.000000e+00 : f32 + %14 = tensor.empty() : tensor<1x1x80x80xf32> + %15 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x1x80x80xf32>) outs(%14 : tensor<1x1x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.subf %cst_655, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x1x80x80xf32> + %16 = 
tensor.empty() : tensor<1x1x80x80xi1> + %17 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<1x1x80x80xf32>) outs(%16 : tensor<1x1x80x80xi1>) { + ^bb0(%in: f32, %out: i1): + %3652 = arith.fptosi %in : f32 to i32 + %3653 = arith.trunci %3652 : i32 to i1 + linalg.yield %3653 : i1 + } -> tensor<1x1x80x80xi1> + %cst_656 = arith.constant -3.40282347E+38 : f32 + %18 = tensor.empty() : tensor<1x1x80x80xf32> + %19 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %17 : tensor<1x1x80x80xf32>, tensor<1x1x80x80xi1>) outs(%18 : tensor<1x1x80x80xf32>) { + ^bb0(%in: f32, %in_2684: i1, %out: f32): + %3652 = arith.select %in_2684, %cst_656, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x1x80x80xf32> + %expanded_657 = tensor.expand_shape %9 [[0, 1], [2]] : tensor<80x80xf32> into tensor<1x80x80xf32> + %expanded_658 = tensor.expand_shape %expanded_657 [[0, 1], [2], [3]] : tensor<1x80x80xf32> into tensor<1x1x80x80xf32> + %cst_659 = arith.constant dense<0.000000e+00> : tensor<1x1x80x80xf32> + %20 = tensor.empty() : tensor<1x1x80x80xf32> + %21 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %expanded_658 : tensor<1x1x80x80xf32>, tensor<1x1x80x80xf32>) outs(%20 : tensor<1x1x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x1x80x80xf32> + %22 = tensor.empty() : tensor<1x80x4096xf32> + %cst_660 = arith.constant 2.000000e+00 : f32 + %23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x80x4096xf32>) outs(%22 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_660 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_661 = 
arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %24 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%23 : tensor<1x80x4096xf32>) outs(%cst_661 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_662 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %25 = tensor.empty() : tensor<1x80x1xf32> + %26 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%24, %cst_662 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%25 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %27 = tensor.empty() : tensor<1x80x1xf32> + %28 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26 : tensor<1x80x1xf32>) outs(%27 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %29 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_663 = tensor.collapse_shape %28 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %30 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %collapsed_663 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%29 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_664 = tensor.expand_shape %extracted_slice [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %31 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_665 = tensor.collapse_shape %expanded_664 [[0, 1], [2]] : 
tensor<1x1x4096xf32> into tensor<1x4096xf32> + %32 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_665, %30 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%31 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %33 = tensor.empty() : tensor<4096x4096xf32> + %34 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_66 : tensor<4096x4096xf32>) outs(%33 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_666 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_667 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %35 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_666, %34 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_667 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_668 = tensor.expand_shape %35 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %36 = tensor.empty() : tensor<4096x4096xf32> + %37 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_68 : tensor<4096x4096xf32>) outs(%36 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_669 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_670 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %38 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_669, %37 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_670 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_671 = tensor.expand_shape %38 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %39 = tensor.empty() : tensor<4096x4096xf32> + %40 = 
linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_70 : tensor<4096x4096xf32>) outs(%39 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_672 = tensor.collapse_shape %32 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_673 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %41 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_672, %40 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_673 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_674 = tensor.expand_shape %41 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_675 = tensor.expand_shape %expanded_668 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %42 = tensor.empty() : tensor<1x32x80x128xf32> + %43 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_675 : tensor<1x80x32x128xf32>) outs(%42 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_676 = tensor.expand_shape %expanded_671 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %44 = tensor.empty() : tensor<1x32x80x128xf32> + %45 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_676 : tensor<1x80x32x128xf32>) outs(%44 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_677 = tensor.expand_shape %expanded_674 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %46 = tensor.empty() : tensor<1x32x80x128xf32> + %47 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_677 : tensor<1x80x32x128xf32>) outs(%46 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_678 = tensor.extract_slice %expanded_516[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_679 = tensor.extract_slice %expanded_518[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %48 = tensor.empty() : tensor<1x80x128xf32> + %49 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_678 : tensor<1x1x80x128xf32>) outs(%48 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %50 = tensor.empty() : tensor<80x128xf32> + %51 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%49 : tensor<1x80x128xf32>) outs(%50 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %52 = tensor.empty() : tensor<1x80x128xf32> + %53 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_679 : tensor<1x1x80x128xf32>) outs(%52 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %54 = tensor.empty() : tensor<80x128xf32> + %55 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1x80x128xf32>) outs(%54 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %56 = tensor.empty() : tensor<1x80x128xf32> + %57 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%56 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = 
tensor.extract %51[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_680 = tensor.expand_shape %57 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %58 = tensor.empty() : tensor<1x80x128xf32> + %59 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%58 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %55[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_681 = tensor.expand_shape %59 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %60 = tensor.empty() : tensor<1x32x80x128xf32> + %61 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%43, %57 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%60 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_682 = tensor.extract_slice %43[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_683 = tensor.extract_slice %43[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %62 = tensor.empty() : tensor<1x32x80x64xf32> + %63 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_683 : tensor<1x32x80x64xf32>) outs(%62 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %64 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice = tensor.insert_slice %63 into %64[0, 0, 0, 0] 
[1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_684 = tensor.insert_slice %extracted_slice_682 into %inserted_slice[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %65 = tensor.empty() : tensor<1x32x80x128xf32> + %66 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_684, %59 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%65 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %67 = tensor.empty() : tensor<1x32x80x128xf32> + %68 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%61, %66 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%67 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %69 = tensor.empty() : tensor<1x32x80x128xf32> + %70 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%45, %57 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%69 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_685 = tensor.extract_slice %45[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_686 = tensor.extract_slice %45[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %71 = tensor.empty() : tensor<1x32x80x64xf32> + %72 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%extracted_slice_686 : tensor<1x32x80x64xf32>) outs(%71 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %73 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_687 = tensor.insert_slice %72 into %73[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_688 = tensor.insert_slice %extracted_slice_685 into %inserted_slice_687[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %74 = tensor.empty() : tensor<1x32x80x128xf32> + %75 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_688, %59 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%74 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %76 = tensor.empty() : tensor<1x32x80x128xf32> + %77 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%70, %75 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%76 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %78 = tensor.empty() : tensor<1x32x128x80xf32> + %79 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%77 : tensor<1x32x80x128xf32>) outs(%78 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_689 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_690 = tensor.collapse_shape %68 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + 
%cst_691 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_692 = tensor.collapse_shape %79 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_693 = arith.constant 0.000000e+00 : f32 + %80 = tensor.empty() : tensor<32x80x80xf32> + %81 = linalg.fill ins(%cst_693 : f32) outs(%80 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %82 = linalg.batch_matmul ins(%collapsed_690, %collapsed_692 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%81 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_694 = tensor.expand_shape %82 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_695 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %83 = tensor.empty() : tensor<1x32x80x80xf32> + %84 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_695 : tensor<1x32x80x80xf32>) outs(%83 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %85 = tensor.empty() : tensor<1x32x80x80xf32> + %86 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_694, %84 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%85 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %87 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_696 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %88 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%86, %collapsed_696 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%87 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, 
%in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %89 = tensor.empty() : tensor<1x32x80x1xf32> + %90 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%89 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %91 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%88 : tensor<1x32x80x80xf32>) outs(%89 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %92 = tensor.empty() : tensor<1x32x80x80xf32> + %93 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%88, %91 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%92 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %94 = tensor.empty() : tensor<1x32x80x1xf32> + %95 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%94 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %96 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%93 : tensor<1x32x80x80xf32>) outs(%95 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %97 = tensor.empty() : tensor<1x32x80x80xf32> + %98 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%93, %96 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%97 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_697 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_698 = tensor.collapse_shape %98 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_699 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_700 = tensor.collapse_shape %47 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_701 = arith.constant 0.000000e+00 : f32 + %99 = tensor.empty() : tensor<32x80x128xf32> + %100 = linalg.fill ins(%cst_701 : f32) outs(%99 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %101 = linalg.batch_matmul ins(%collapsed_698, %collapsed_700 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%100 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_702 = tensor.expand_shape %101 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %102 = tensor.empty() : tensor<1x80x32x128xf32> + %103 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_702 : tensor<1x32x80x128xf32>) outs(%102 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_703 = tensor.collapse_shape %103 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %104 = tensor.empty() : tensor<4096x4096xf32> + %105 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_72 : tensor<4096x4096xf32>) outs(%104 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_704 = tensor.collapse_shape %collapsed_703 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_705 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %106 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_704, %105 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_705 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_706 = tensor.expand_shape %106 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %107 = tensor.empty() : tensor<1x80x4096xf32> + %108 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %expanded_706 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%107 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %109 = tensor.empty() : tensor<1x80x4096xf32> + %cst_707 = arith.constant 2.000000e+00 : f32 + %110 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108 : tensor<1x80x4096xf32>) outs(%109 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_707 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_708 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %111 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%110 : tensor<1x80x4096xf32>) outs(%cst_708 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_709 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %112 = tensor.empty() : tensor<1x80x1xf32> + %113 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%111, %cst_709 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) 
outs(%112 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %114 = tensor.empty() : tensor<1x80x1xf32> + %115 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%113 : tensor<1x80x1xf32>) outs(%114 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %116 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_710 = tensor.collapse_shape %115 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %117 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108, %collapsed_710 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%116 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_711 = tensor.expand_shape %extracted_slice_0 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %118 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_712 = tensor.collapse_shape %expanded_711 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %119 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_712, %117 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%118 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %120 = tensor.empty() : tensor<4096x11008xf32> + %121 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_74 : tensor<11008x4096xf32>) outs(%120 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_713 = 
tensor.collapse_shape %119 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_714 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %122 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_713, %121 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_714 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_715 = tensor.expand_shape %122 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %123 = tensor.empty() : tensor<1x80x11008xf32> + %124 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_715 : tensor<1x80x11008xf32>) outs(%123 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %125 = tensor.empty() : tensor<4096x11008xf32> + %126 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_76 : tensor<11008x4096xf32>) outs(%125 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_716 = tensor.collapse_shape %119 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_717 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %127 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_716, %126 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_717 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_718 = tensor.expand_shape %127 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %128 = tensor.empty() : tensor<1x80x11008xf32> + %129 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%124, %expanded_718 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) 
outs(%128 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %130 = tensor.empty() : tensor<11008x4096xf32> + %131 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_78 : tensor<4096x11008xf32>) outs(%130 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_719 = tensor.collapse_shape %129 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_720 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %132 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_719, %131 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_720 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_721 = tensor.expand_shape %132 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %133 = tensor.empty() : tensor<1x80x4096xf32> + %134 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%108, %expanded_721 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%133 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %135 = tensor.empty() : tensor<1x80x4096xf32> + %cst_722 = arith.constant 2.000000e+00 : f32 + %136 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134 : tensor<1x80x4096xf32>) outs(%135 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_722 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_723 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %137 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%136 : 
tensor<1x80x4096xf32>) outs(%cst_723 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_724 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %138 = tensor.empty() : tensor<1x80x1xf32> + %139 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%137, %cst_724 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%138 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %140 = tensor.empty() : tensor<1x80x1xf32> + %141 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%139 : tensor<1x80x1xf32>) outs(%140 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %142 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_725 = tensor.collapse_shape %141 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %143 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134, %collapsed_725 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%142 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_726 = tensor.expand_shape %extracted_slice_1 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %144 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_727 = tensor.collapse_shape %expanded_726 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %145 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%collapsed_727, %143 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%144 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %146 = tensor.empty() : tensor<4096x4096xf32> + %147 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_80 : tensor<4096x4096xf32>) outs(%146 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_728 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_729 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %148 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_728, %147 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_729 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_730 = tensor.expand_shape %148 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %149 = tensor.empty() : tensor<4096x4096xf32> + %150 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_82 : tensor<4096x4096xf32>) outs(%149 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_731 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_732 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %151 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_731, %150 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_732 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_733 = tensor.expand_shape %151 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %152 = tensor.empty() : tensor<4096x4096xf32> + %153 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_84 : tensor<4096x4096xf32>) outs(%152 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_734 = tensor.collapse_shape %145 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_735 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %154 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_734, %153 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_735 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_736 = tensor.expand_shape %154 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_737 = tensor.expand_shape %expanded_730 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %155 = tensor.empty() : tensor<1x32x80x128xf32> + %156 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_737 : tensor<1x80x32x128xf32>) outs(%155 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_738 = tensor.expand_shape %expanded_733 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %157 = tensor.empty() : tensor<1x32x80x128xf32> + %158 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_738 : tensor<1x80x32x128xf32>) outs(%157 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_739 = tensor.expand_shape %expanded_736 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %159 = tensor.empty() : tensor<1x32x80x128xf32> + %160 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_739 : tensor<1x80x32x128xf32>) outs(%159 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_740 
= tensor.extract_slice %expanded_520[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_741 = tensor.extract_slice %expanded_522[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %161 = tensor.empty() : tensor<1x80x128xf32> + %162 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_740 : tensor<1x1x80x128xf32>) outs(%161 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %163 = tensor.empty() : tensor<80x128xf32> + %164 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%162 : tensor<1x80x128xf32>) outs(%163 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %165 = tensor.empty() : tensor<1x80x128xf32> + %166 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_741 : tensor<1x1x80x128xf32>) outs(%165 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %167 = tensor.empty() : tensor<80x128xf32> + %168 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%166 : tensor<1x80x128xf32>) outs(%167 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %169 = tensor.empty() : tensor<1x80x128xf32> + %170 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%169 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %164[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + 
%expanded_742 = tensor.expand_shape %170 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %171 = tensor.empty() : tensor<1x80x128xf32> + %172 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%171 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %168[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_743 = tensor.expand_shape %172 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %173 = tensor.empty() : tensor<1x32x80x128xf32> + %174 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%156, %170 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%173 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_744 = tensor.extract_slice %156[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_745 = tensor.extract_slice %156[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %175 = tensor.empty() : tensor<1x32x80x64xf32> + %176 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_745 : tensor<1x32x80x64xf32>) outs(%175 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %177 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_746 = tensor.insert_slice %176 into %177[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_747 = tensor.insert_slice %extracted_slice_744 into %inserted_slice_746[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %178 = tensor.empty() : tensor<1x32x80x128xf32> + %179 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_747, %172 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%178 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %180 = tensor.empty() : tensor<1x32x80x128xf32> + %181 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%174, %179 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%180 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %182 = tensor.empty() : tensor<1x32x80x128xf32> + %183 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158, %170 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%182 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_748 = tensor.extract_slice %158[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_749 = tensor.extract_slice %158[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %184 = tensor.empty() : tensor<1x32x80x64xf32> + %185 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_749 : tensor<1x32x80x64xf32>) 
outs(%184 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %186 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_750 = tensor.insert_slice %185 into %186[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_751 = tensor.insert_slice %extracted_slice_748 into %inserted_slice_750[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %187 = tensor.empty() : tensor<1x32x80x128xf32> + %188 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_751, %172 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%187 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %189 = tensor.empty() : tensor<1x32x80x128xf32> + %190 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%183, %188 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%189 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %191 = tensor.empty() : tensor<1x32x128x80xf32> + %192 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%190 : tensor<1x32x80x128xf32>) outs(%191 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_752 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_753 = tensor.collapse_shape %181 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_754 = arith.constant dense<0.000000e+00> : 
tensor<1x32x128x80xf32> + %collapsed_755 = tensor.collapse_shape %192 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_756 = arith.constant 0.000000e+00 : f32 + %193 = tensor.empty() : tensor<32x80x80xf32> + %194 = linalg.fill ins(%cst_756 : f32) outs(%193 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %195 = linalg.batch_matmul ins(%collapsed_753, %collapsed_755 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%194 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_757 = tensor.expand_shape %195 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_758 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %196 = tensor.empty() : tensor<1x32x80x80xf32> + %197 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_758 : tensor<1x32x80x80xf32>) outs(%196 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %198 = tensor.empty() : tensor<1x32x80x80xf32> + %199 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_757, %197 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%198 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %200 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_759 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %201 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%199, %collapsed_759 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%200 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %202 = tensor.empty() : tensor<1x32x80x1xf32> + %203 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%202 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %204 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%201 : tensor<1x32x80x80xf32>) outs(%202 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %205 = tensor.empty() : tensor<1x32x80x80xf32> + %206 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%201, %204 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%205 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %207 = tensor.empty() : tensor<1x32x80x1xf32> + %208 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%207 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %209 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%206 : tensor<1x32x80x80xf32>) outs(%208 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %210 = tensor.empty() : tensor<1x32x80x80xf32> + %211 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%206, %209 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%210 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_760 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_761 = tensor.collapse_shape %211 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_762 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_763 = tensor.collapse_shape %160 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_764 = arith.constant 0.000000e+00 : f32 + %212 = tensor.empty() : tensor<32x80x128xf32> + %213 = linalg.fill ins(%cst_764 : f32) outs(%212 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %214 = linalg.batch_matmul ins(%collapsed_761, %collapsed_763 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%213 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_765 = tensor.expand_shape %214 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %215 = tensor.empty() : tensor<1x80x32x128xf32> + %216 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_765 : tensor<1x32x80x128xf32>) outs(%215 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_766 = tensor.collapse_shape %216 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %217 = tensor.empty() : tensor<4096x4096xf32> + %218 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_86 : tensor<4096x4096xf32>) outs(%217 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_767 = tensor.collapse_shape %collapsed_766 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_768 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %219 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_767, %218 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_768 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_769 = tensor.expand_shape %219 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %220 = tensor.empty() : tensor<1x80x4096xf32> + %221 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134, %expanded_769 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%220 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %222 = tensor.empty() : tensor<1x80x4096xf32> + %cst_770 = arith.constant 2.000000e+00 : f32 + %223 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221 : tensor<1x80x4096xf32>) outs(%222 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_770 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_771 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %224 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%223 : tensor<1x80x4096xf32>) outs(%cst_771 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_772 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %225 = tensor.empty() : tensor<1x80x1xf32> + %226 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%224, %cst_772 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) 
outs(%225 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %227 = tensor.empty() : tensor<1x80x1xf32> + %228 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%226 : tensor<1x80x1xf32>) outs(%227 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %229 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_773 = tensor.collapse_shape %228 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %230 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221, %collapsed_773 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%229 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_774 = tensor.expand_shape %extracted_slice_2 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %231 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_775 = tensor.collapse_shape %expanded_774 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %232 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_775, %230 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%231 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %233 = tensor.empty() : tensor<4096x11008xf32> + %234 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_88 : tensor<11008x4096xf32>) outs(%233 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_776 = 
tensor.collapse_shape %232 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_777 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %235 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_776, %234 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_777 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_778 = tensor.expand_shape %235 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %236 = tensor.empty() : tensor<1x80x11008xf32> + %237 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_778 : tensor<1x80x11008xf32>) outs(%236 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %238 = tensor.empty() : tensor<4096x11008xf32> + %239 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_90 : tensor<11008x4096xf32>) outs(%238 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_779 = tensor.collapse_shape %232 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_780 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %240 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_779, %239 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_780 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_781 = tensor.expand_shape %240 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %241 = tensor.empty() : tensor<1x80x11008xf32> + %242 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%237, %expanded_781 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) 
outs(%241 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %243 = tensor.empty() : tensor<11008x4096xf32> + %244 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_92 : tensor<4096x11008xf32>) outs(%243 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_782 = tensor.collapse_shape %242 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_783 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %245 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_782, %244 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_783 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_784 = tensor.expand_shape %245 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %246 = tensor.empty() : tensor<1x80x4096xf32> + %247 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%221, %expanded_784 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%246 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %248 = tensor.empty() : tensor<1x80x4096xf32> + %cst_785 = arith.constant 2.000000e+00 : f32 + %249 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247 : tensor<1x80x4096xf32>) outs(%248 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_785 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_786 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %250 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%249 : 
tensor<1x80x4096xf32>) outs(%cst_786 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_787 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %251 = tensor.empty() : tensor<1x80x1xf32> + %252 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%250, %cst_787 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%251 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %253 = tensor.empty() : tensor<1x80x1xf32> + %254 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%252 : tensor<1x80x1xf32>) outs(%253 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %255 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_788 = tensor.collapse_shape %254 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %256 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247, %collapsed_788 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%255 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_789 = tensor.expand_shape %extracted_slice_3 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %257 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_790 = tensor.collapse_shape %expanded_789 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %258 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%collapsed_790, %256 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%257 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %259 = tensor.empty() : tensor<4096x4096xf32> + %260 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_94 : tensor<4096x4096xf32>) outs(%259 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_791 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_792 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %261 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_791, %260 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_792 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_793 = tensor.expand_shape %261 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %262 = tensor.empty() : tensor<4096x4096xf32> + %263 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_96 : tensor<4096x4096xf32>) outs(%262 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_794 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_795 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %264 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_794, %263 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_795 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_796 = tensor.expand_shape %264 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %265 = tensor.empty() : tensor<4096x4096xf32> + %266 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_98 : tensor<4096x4096xf32>) outs(%265 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_797 = tensor.collapse_shape %258 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_798 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %267 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_797, %266 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_798 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_799 = tensor.expand_shape %267 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_800 = tensor.expand_shape %expanded_793 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %268 = tensor.empty() : tensor<1x32x80x128xf32> + %269 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_800 : tensor<1x80x32x128xf32>) outs(%268 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_801 = tensor.expand_shape %expanded_796 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %270 = tensor.empty() : tensor<1x32x80x128xf32> + %271 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_801 : tensor<1x80x32x128xf32>) outs(%270 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_802 = tensor.expand_shape %expanded_799 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %272 = tensor.empty() : tensor<1x32x80x128xf32> + %273 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_802 : tensor<1x80x32x128xf32>) outs(%272 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_803 
= tensor.extract_slice %expanded_524[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_804 = tensor.extract_slice %expanded_526[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %274 = tensor.empty() : tensor<1x80x128xf32> + %275 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_803 : tensor<1x1x80x128xf32>) outs(%274 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %276 = tensor.empty() : tensor<80x128xf32> + %277 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%275 : tensor<1x80x128xf32>) outs(%276 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %278 = tensor.empty() : tensor<1x80x128xf32> + %279 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_804 : tensor<1x1x80x128xf32>) outs(%278 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %280 = tensor.empty() : tensor<80x128xf32> + %281 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%279 : tensor<1x80x128xf32>) outs(%280 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %282 = tensor.empty() : tensor<1x80x128xf32> + %283 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%282 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %277[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + 
%expanded_805 = tensor.expand_shape %283 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %284 = tensor.empty() : tensor<1x80x128xf32> + %285 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%284 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %281[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_806 = tensor.expand_shape %285 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %286 = tensor.empty() : tensor<1x32x80x128xf32> + %287 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%269, %283 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%286 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_807 = tensor.extract_slice %269[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_808 = tensor.extract_slice %269[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %288 = tensor.empty() : tensor<1x32x80x64xf32> + %289 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_808 : tensor<1x32x80x64xf32>) outs(%288 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %290 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_809 = tensor.insert_slice %289 into %290[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_810 = tensor.insert_slice %extracted_slice_807 into %inserted_slice_809[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %291 = tensor.empty() : tensor<1x32x80x128xf32> + %292 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_810, %285 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%291 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %293 = tensor.empty() : tensor<1x32x80x128xf32> + %294 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%287, %292 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%293 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %295 = tensor.empty() : tensor<1x32x80x128xf32> + %296 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%271, %283 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%295 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_811 = tensor.extract_slice %271[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_812 = tensor.extract_slice %271[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %297 = tensor.empty() : tensor<1x32x80x64xf32> + %298 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_812 : tensor<1x32x80x64xf32>) 
outs(%297 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %299 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_813 = tensor.insert_slice %298 into %299[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_814 = tensor.insert_slice %extracted_slice_811 into %inserted_slice_813[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %300 = tensor.empty() : tensor<1x32x80x128xf32> + %301 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_814, %285 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%300 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %302 = tensor.empty() : tensor<1x32x80x128xf32> + %303 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%296, %301 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%302 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %304 = tensor.empty() : tensor<1x32x128x80xf32> + %305 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%303 : tensor<1x32x80x128xf32>) outs(%304 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_815 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_816 = tensor.collapse_shape %294 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_817 = arith.constant dense<0.000000e+00> : 
tensor<1x32x128x80xf32> + %collapsed_818 = tensor.collapse_shape %305 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_819 = arith.constant 0.000000e+00 : f32 + %306 = tensor.empty() : tensor<32x80x80xf32> + %307 = linalg.fill ins(%cst_819 : f32) outs(%306 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %308 = linalg.batch_matmul ins(%collapsed_816, %collapsed_818 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%307 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_820 = tensor.expand_shape %308 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_821 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %309 = tensor.empty() : tensor<1x32x80x80xf32> + %310 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_821 : tensor<1x32x80x80xf32>) outs(%309 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %311 = tensor.empty() : tensor<1x32x80x80xf32> + %312 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_820, %310 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%311 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %313 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_822 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %314 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%312, %collapsed_822 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%313 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %315 = tensor.empty() : tensor<1x32x80x1xf32> + %316 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%315 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %317 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%314 : tensor<1x32x80x80xf32>) outs(%315 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %318 = tensor.empty() : tensor<1x32x80x80xf32> + %319 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%314, %317 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%318 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %320 = tensor.empty() : tensor<1x32x80x1xf32> + %321 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%320 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %322 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%319 : tensor<1x32x80x80xf32>) outs(%321 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %323 = tensor.empty() : tensor<1x32x80x80xf32> + %324 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%319, %322 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%323 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_823 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_824 = tensor.collapse_shape %324 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_825 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_826 = tensor.collapse_shape %273 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_827 = arith.constant 0.000000e+00 : f32 + %325 = tensor.empty() : tensor<32x80x128xf32> + %326 = linalg.fill ins(%cst_827 : f32) outs(%325 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %327 = linalg.batch_matmul ins(%collapsed_824, %collapsed_826 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%326 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_828 = tensor.expand_shape %327 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %328 = tensor.empty() : tensor<1x80x32x128xf32> + %329 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_828 : tensor<1x32x80x128xf32>) outs(%328 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_829 = tensor.collapse_shape %329 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %330 = tensor.empty() : tensor<4096x4096xf32> + %331 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_100 : tensor<4096x4096xf32>) outs(%330 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_830 = tensor.collapse_shape %collapsed_829 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_831 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %332 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_830, %331 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_831 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_832 = tensor.expand_shape %332 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %333 = tensor.empty() : tensor<1x80x4096xf32> + %334 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%247, %expanded_832 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%333 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %335 = tensor.empty() : tensor<1x80x4096xf32> + %cst_833 = arith.constant 2.000000e+00 : f32 + %336 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334 : tensor<1x80x4096xf32>) outs(%335 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_833 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_834 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %337 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%336 : tensor<1x80x4096xf32>) outs(%cst_834 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_835 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %338 = tensor.empty() : tensor<1x80x1xf32> + %339 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%337, %cst_835 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) 
outs(%338 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %340 = tensor.empty() : tensor<1x80x1xf32> + %341 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%339 : tensor<1x80x1xf32>) outs(%340 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %342 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_836 = tensor.collapse_shape %341 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %343 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334, %collapsed_836 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%342 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_837 = tensor.expand_shape %extracted_slice_4 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %344 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_838 = tensor.collapse_shape %expanded_837 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %345 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_838, %343 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%344 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %346 = tensor.empty() : tensor<4096x11008xf32> + %347 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_102 : tensor<11008x4096xf32>) outs(%346 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_839 
= tensor.collapse_shape %345 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_840 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %348 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_839, %347 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_840 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_841 = tensor.expand_shape %348 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %349 = tensor.empty() : tensor<1x80x11008xf32> + %350 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_841 : tensor<1x80x11008xf32>) outs(%349 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %351 = tensor.empty() : tensor<4096x11008xf32> + %352 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_104 : tensor<11008x4096xf32>) outs(%351 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_842 = tensor.collapse_shape %345 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_843 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %353 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_842, %352 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_843 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_844 = tensor.expand_shape %353 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %354 = tensor.empty() : tensor<1x80x11008xf32> + %355 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%350, %expanded_844 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) 
outs(%354 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %356 = tensor.empty() : tensor<11008x4096xf32> + %357 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_106 : tensor<4096x11008xf32>) outs(%356 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_845 = tensor.collapse_shape %355 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_846 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %358 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_845, %357 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_846 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_847 = tensor.expand_shape %358 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %359 = tensor.empty() : tensor<1x80x4096xf32> + %360 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%334, %expanded_847 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%359 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %361 = tensor.empty() : tensor<1x80x4096xf32> + %cst_848 = arith.constant 2.000000e+00 : f32 + %362 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360 : tensor<1x80x4096xf32>) outs(%361 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_848 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_849 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %363 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%362 : 
tensor<1x80x4096xf32>) outs(%cst_849 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_850 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %364 = tensor.empty() : tensor<1x80x1xf32> + %365 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%363, %cst_850 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%364 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %366 = tensor.empty() : tensor<1x80x1xf32> + %367 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%365 : tensor<1x80x1xf32>) outs(%366 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %368 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_851 = tensor.collapse_shape %367 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %369 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360, %collapsed_851 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%368 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_852 = tensor.expand_shape %extracted_slice_5 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %370 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_853 = tensor.collapse_shape %expanded_852 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %371 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%collapsed_853, %369 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%370 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %372 = tensor.empty() : tensor<4096x4096xf32> + %373 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_108 : tensor<4096x4096xf32>) outs(%372 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_854 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_855 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %374 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_854, %373 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_855 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_856 = tensor.expand_shape %374 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %375 = tensor.empty() : tensor<4096x4096xf32> + %376 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_110 : tensor<4096x4096xf32>) outs(%375 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_857 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_858 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %377 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_857, %376 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_858 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_859 = tensor.expand_shape %377 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %378 = tensor.empty() : tensor<4096x4096xf32> + %379 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_112 : tensor<4096x4096xf32>) outs(%378 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_860 = tensor.collapse_shape %371 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_861 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %380 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_860, %379 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_861 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_862 = tensor.expand_shape %380 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_863 = tensor.expand_shape %expanded_856 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %381 = tensor.empty() : tensor<1x32x80x128xf32> + %382 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_863 : tensor<1x80x32x128xf32>) outs(%381 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_864 = tensor.expand_shape %expanded_859 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %383 = tensor.empty() : tensor<1x32x80x128xf32> + %384 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_864 : tensor<1x80x32x128xf32>) outs(%383 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_865 = tensor.expand_shape %expanded_862 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %385 = tensor.empty() : tensor<1x32x80x128xf32> + %386 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_865 : tensor<1x80x32x128xf32>) outs(%385 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_866 
= tensor.extract_slice %expanded_528[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_867 = tensor.extract_slice %expanded_530[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %387 = tensor.empty() : tensor<1x80x128xf32> + %388 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_866 : tensor<1x1x80x128xf32>) outs(%387 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %389 = tensor.empty() : tensor<80x128xf32> + %390 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%388 : tensor<1x80x128xf32>) outs(%389 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %391 = tensor.empty() : tensor<1x80x128xf32> + %392 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_867 : tensor<1x1x80x128xf32>) outs(%391 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %393 = tensor.empty() : tensor<80x128xf32> + %394 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%392 : tensor<1x80x128xf32>) outs(%393 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %395 = tensor.empty() : tensor<1x80x128xf32> + %396 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%395 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %390[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + 
%expanded_868 = tensor.expand_shape %396 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %397 = tensor.empty() : tensor<1x80x128xf32> + %398 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%397 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %394[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_869 = tensor.expand_shape %398 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %399 = tensor.empty() : tensor<1x32x80x128xf32> + %400 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%382, %396 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%399 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_870 = tensor.extract_slice %382[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_871 = tensor.extract_slice %382[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %401 = tensor.empty() : tensor<1x32x80x64xf32> + %402 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_871 : tensor<1x32x80x64xf32>) outs(%401 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %403 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_872 = tensor.insert_slice %402 into %403[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_873 = tensor.insert_slice %extracted_slice_870 into %inserted_slice_872[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %404 = tensor.empty() : tensor<1x32x80x128xf32> + %405 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_873, %398 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%404 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %406 = tensor.empty() : tensor<1x32x80x128xf32> + %407 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%400, %405 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%406 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %408 = tensor.empty() : tensor<1x32x80x128xf32> + %409 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%384, %396 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%408 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_874 = tensor.extract_slice %384[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_875 = tensor.extract_slice %384[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %410 = tensor.empty() : tensor<1x32x80x64xf32> + %411 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_875 : tensor<1x32x80x64xf32>) 
outs(%410 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %412 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_876 = tensor.insert_slice %411 into %412[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_877 = tensor.insert_slice %extracted_slice_874 into %inserted_slice_876[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %413 = tensor.empty() : tensor<1x32x80x128xf32> + %414 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_877, %398 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%413 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %415 = tensor.empty() : tensor<1x32x80x128xf32> + %416 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%409, %414 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%415 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %417 = tensor.empty() : tensor<1x32x128x80xf32> + %418 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%416 : tensor<1x32x80x128xf32>) outs(%417 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_878 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_879 = tensor.collapse_shape %407 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_880 = arith.constant dense<0.000000e+00> : 
tensor<1x32x128x80xf32> + %collapsed_881 = tensor.collapse_shape %418 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_882 = arith.constant 0.000000e+00 : f32 + %419 = tensor.empty() : tensor<32x80x80xf32> + %420 = linalg.fill ins(%cst_882 : f32) outs(%419 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %421 = linalg.batch_matmul ins(%collapsed_879, %collapsed_881 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%420 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_883 = tensor.expand_shape %421 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_884 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %422 = tensor.empty() : tensor<1x32x80x80xf32> + %423 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_884 : tensor<1x32x80x80xf32>) outs(%422 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %424 = tensor.empty() : tensor<1x32x80x80xf32> + %425 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_883, %423 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%424 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %426 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_885 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %427 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%425, %collapsed_885 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%426 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %428 = tensor.empty() : tensor<1x32x80x1xf32> + %429 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%428 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %430 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%427 : tensor<1x32x80x80xf32>) outs(%428 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %431 = tensor.empty() : tensor<1x32x80x80xf32> + %432 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%427, %430 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%431 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %433 = tensor.empty() : tensor<1x32x80x1xf32> + %434 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%433 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %435 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%432 : tensor<1x32x80x80xf32>) outs(%434 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %436 = tensor.empty() : tensor<1x32x80x80xf32> + %437 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%432, %435 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%436 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_886 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_887 = tensor.collapse_shape %437 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_888 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_889 = tensor.collapse_shape %386 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_890 = arith.constant 0.000000e+00 : f32 + %438 = tensor.empty() : tensor<32x80x128xf32> + %439 = linalg.fill ins(%cst_890 : f32) outs(%438 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %440 = linalg.batch_matmul ins(%collapsed_887, %collapsed_889 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%439 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_891 = tensor.expand_shape %440 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %441 = tensor.empty() : tensor<1x80x32x128xf32> + %442 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_891 : tensor<1x32x80x128xf32>) outs(%441 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_892 = tensor.collapse_shape %442 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %443 = tensor.empty() : tensor<4096x4096xf32> + %444 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_114 : tensor<4096x4096xf32>) outs(%443 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_893 = tensor.collapse_shape %collapsed_892 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_894 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %445 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_893, %444 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_894 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_895 = tensor.expand_shape %445 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %446 = tensor.empty() : tensor<1x80x4096xf32> + %447 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360, %expanded_895 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%446 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %448 = tensor.empty() : tensor<1x80x4096xf32> + %cst_896 = arith.constant 2.000000e+00 : f32 + %449 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447 : tensor<1x80x4096xf32>) outs(%448 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_896 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_897 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %450 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%449 : tensor<1x80x4096xf32>) outs(%cst_897 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_898 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %451 = tensor.empty() : tensor<1x80x1xf32> + %452 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%450, %cst_898 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) 
outs(%451 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %453 = tensor.empty() : tensor<1x80x1xf32> + %454 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%452 : tensor<1x80x1xf32>) outs(%453 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %455 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_899 = tensor.collapse_shape %454 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %456 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447, %collapsed_899 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%455 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_900 = tensor.expand_shape %extracted_slice_6 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %457 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_901 = tensor.collapse_shape %expanded_900 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %458 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_901, %456 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%457 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %459 = tensor.empty() : tensor<4096x11008xf32> + %460 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_116 : tensor<11008x4096xf32>) outs(%459 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_902 
= tensor.collapse_shape %458 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_903 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %461 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_902, %460 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_903 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_904 = tensor.expand_shape %461 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %462 = tensor.empty() : tensor<1x80x11008xf32> + %463 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_904 : tensor<1x80x11008xf32>) outs(%462 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %464 = tensor.empty() : tensor<4096x11008xf32> + %465 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_118 : tensor<11008x4096xf32>) outs(%464 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_905 = tensor.collapse_shape %458 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_906 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %466 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_905, %465 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_906 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_907 = tensor.expand_shape %466 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %467 = tensor.empty() : tensor<1x80x11008xf32> + %468 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%463, %expanded_907 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) 
outs(%467 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %469 = tensor.empty() : tensor<11008x4096xf32> + %470 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_120 : tensor<4096x11008xf32>) outs(%469 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_908 = tensor.collapse_shape %468 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_909 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %471 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_908, %470 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_909 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_910 = tensor.expand_shape %471 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %472 = tensor.empty() : tensor<1x80x4096xf32> + %473 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%447, %expanded_910 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%472 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %474 = tensor.empty() : tensor<1x80x4096xf32> + %cst_911 = arith.constant 2.000000e+00 : f32 + %475 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473 : tensor<1x80x4096xf32>) outs(%474 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_911 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_912 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %476 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%475 : 
tensor<1x80x4096xf32>) outs(%cst_912 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_913 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %477 = tensor.empty() : tensor<1x80x1xf32> + %478 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%476, %cst_913 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%477 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %479 = tensor.empty() : tensor<1x80x1xf32> + %480 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%478 : tensor<1x80x1xf32>) outs(%479 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %481 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_914 = tensor.collapse_shape %480 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %482 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %collapsed_914 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%481 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_915 = tensor.expand_shape %extracted_slice_7 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %483 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_916 = tensor.collapse_shape %expanded_915 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %484 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%collapsed_916, %482 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%483 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %485 = tensor.empty() : tensor<4096x4096xf32> + %486 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_122 : tensor<4096x4096xf32>) outs(%485 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_917 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_918 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %487 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_917, %486 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_918 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_919 = tensor.expand_shape %487 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %488 = tensor.empty() : tensor<4096x4096xf32> + %489 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_124 : tensor<4096x4096xf32>) outs(%488 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_920 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_921 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %490 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_920, %489 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_921 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_922 = tensor.expand_shape %490 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %491 = tensor.empty() : tensor<4096x4096xf32> + %492 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_126 : tensor<4096x4096xf32>) outs(%491 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_923 = tensor.collapse_shape %484 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_924 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %493 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_923, %492 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_924 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_925 = tensor.expand_shape %493 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_926 = tensor.expand_shape %expanded_919 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %494 = tensor.empty() : tensor<1x32x80x128xf32> + %495 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_926 : tensor<1x80x32x128xf32>) outs(%494 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_927 = tensor.expand_shape %expanded_922 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %496 = tensor.empty() : tensor<1x32x80x128xf32> + %497 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_927 : tensor<1x80x32x128xf32>) outs(%496 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_928 = tensor.expand_shape %expanded_925 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %498 = tensor.empty() : tensor<1x32x80x128xf32> + %499 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_928 : tensor<1x80x32x128xf32>) outs(%498 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_929 
= tensor.extract_slice %expanded_532[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_930 = tensor.extract_slice %expanded_534[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %500 = tensor.empty() : tensor<1x80x128xf32> + %501 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_929 : tensor<1x1x80x128xf32>) outs(%500 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %502 = tensor.empty() : tensor<80x128xf32> + %503 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%501 : tensor<1x80x128xf32>) outs(%502 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %504 = tensor.empty() : tensor<1x80x128xf32> + %505 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_930 : tensor<1x1x80x128xf32>) outs(%504 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %506 = tensor.empty() : tensor<80x128xf32> + %507 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%505 : tensor<1x80x128xf32>) outs(%506 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %508 = tensor.empty() : tensor<1x80x128xf32> + %509 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%508 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %503[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + 
%expanded_931 = tensor.expand_shape %509 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %510 = tensor.empty() : tensor<1x80x128xf32> + %511 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%510 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %507[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_932 = tensor.expand_shape %511 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %512 = tensor.empty() : tensor<1x32x80x128xf32> + %513 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%495, %509 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%512 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_933 = tensor.extract_slice %495[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_934 = tensor.extract_slice %495[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %514 = tensor.empty() : tensor<1x32x80x64xf32> + %515 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_934 : tensor<1x32x80x64xf32>) outs(%514 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %516 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_935 = tensor.insert_slice %515 into %516[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_936 = tensor.insert_slice %extracted_slice_933 into %inserted_slice_935[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %517 = tensor.empty() : tensor<1x32x80x128xf32> + %518 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_936, %511 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%517 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %519 = tensor.empty() : tensor<1x32x80x128xf32> + %520 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%513, %518 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%519 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %521 = tensor.empty() : tensor<1x32x80x128xf32> + %522 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%497, %509 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%521 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_937 = tensor.extract_slice %497[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_938 = tensor.extract_slice %497[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %523 = tensor.empty() : tensor<1x32x80x64xf32> + %524 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_938 : tensor<1x32x80x64xf32>) 
outs(%523 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %525 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_939 = tensor.insert_slice %524 into %525[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_940 = tensor.insert_slice %extracted_slice_937 into %inserted_slice_939[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %526 = tensor.empty() : tensor<1x32x80x128xf32> + %527 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_940, %511 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%526 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %528 = tensor.empty() : tensor<1x32x80x128xf32> + %529 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%522, %527 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%528 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %530 = tensor.empty() : tensor<1x32x128x80xf32> + %531 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%529 : tensor<1x32x80x128xf32>) outs(%530 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_941 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_942 = tensor.collapse_shape %520 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_943 = arith.constant dense<0.000000e+00> : 
tensor<1x32x128x80xf32> + %collapsed_944 = tensor.collapse_shape %531 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_945 = arith.constant 0.000000e+00 : f32 + %532 = tensor.empty() : tensor<32x80x80xf32> + %533 = linalg.fill ins(%cst_945 : f32) outs(%532 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %534 = linalg.batch_matmul ins(%collapsed_942, %collapsed_944 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%533 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_946 = tensor.expand_shape %534 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_947 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %535 = tensor.empty() : tensor<1x32x80x80xf32> + %536 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_947 : tensor<1x32x80x80xf32>) outs(%535 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %537 = tensor.empty() : tensor<1x32x80x80xf32> + %538 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_946, %536 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%537 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %539 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_948 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %540 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%538, %collapsed_948 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%539 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %541 = tensor.empty() : tensor<1x32x80x1xf32> + %542 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%541 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %543 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%540 : tensor<1x32x80x80xf32>) outs(%541 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %544 = tensor.empty() : tensor<1x32x80x80xf32> + %545 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%540, %543 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%544 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %546 = tensor.empty() : tensor<1x32x80x1xf32> + %547 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%546 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %548 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%545 : tensor<1x32x80x80xf32>) outs(%547 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %549 = tensor.empty() : tensor<1x32x80x80xf32> + %550 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%545, %548 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%549 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_949 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_950 = tensor.collapse_shape %550 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_951 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_952 = tensor.collapse_shape %499 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_953 = arith.constant 0.000000e+00 : f32 + %551 = tensor.empty() : tensor<32x80x128xf32> + %552 = linalg.fill ins(%cst_953 : f32) outs(%551 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %553 = linalg.batch_matmul ins(%collapsed_950, %collapsed_952 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%552 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_954 = tensor.expand_shape %553 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %554 = tensor.empty() : tensor<1x80x32x128xf32> + %555 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_954 : tensor<1x32x80x128xf32>) outs(%554 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_955 = tensor.collapse_shape %555 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %556 = tensor.empty() : tensor<4096x4096xf32> + %557 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_128 : tensor<4096x4096xf32>) outs(%556 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_956 = tensor.collapse_shape %collapsed_955 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_957 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %558 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_956, %557 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_957 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_958 = tensor.expand_shape %558 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %559 = tensor.empty() : tensor<1x80x4096xf32> + %560 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %expanded_958 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%559 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %561 = tensor.empty() : tensor<1x80x4096xf32> + %cst_959 = arith.constant 2.000000e+00 : f32 + %562 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560 : tensor<1x80x4096xf32>) outs(%561 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_959 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_960 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %563 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%562 : tensor<1x80x4096xf32>) outs(%cst_960 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_961 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %564 = tensor.empty() : tensor<1x80x1xf32> + %565 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%563, %cst_961 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) 
outs(%564 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %566 = tensor.empty() : tensor<1x80x1xf32> + %567 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%565 : tensor<1x80x1xf32>) outs(%566 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %568 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_962 = tensor.collapse_shape %567 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %569 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560, %collapsed_962 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%568 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_963 = tensor.expand_shape %extracted_slice_8 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %570 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_964 = tensor.collapse_shape %expanded_963 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %571 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_964, %569 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%570 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %572 = tensor.empty() : tensor<4096x11008xf32> + %573 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_130 : tensor<11008x4096xf32>) outs(%572 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_965 
= tensor.collapse_shape %571 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_966 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %574 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_965, %573 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_966 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_967 = tensor.expand_shape %574 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %575 = tensor.empty() : tensor<1x80x11008xf32> + %576 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_967 : tensor<1x80x11008xf32>) outs(%575 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %577 = tensor.empty() : tensor<4096x11008xf32> + %578 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_132 : tensor<11008x4096xf32>) outs(%577 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_968 = tensor.collapse_shape %571 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_969 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %579 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_968, %578 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_969 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_970 = tensor.expand_shape %579 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %580 = tensor.empty() : tensor<1x80x11008xf32> + %581 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%576, %expanded_970 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) 
outs(%580 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %582 = tensor.empty() : tensor<11008x4096xf32> + %583 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_134 : tensor<4096x11008xf32>) outs(%582 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_971 = tensor.collapse_shape %581 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_972 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %584 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_971, %583 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_972 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_973 = tensor.expand_shape %584 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %585 = tensor.empty() : tensor<1x80x4096xf32> + %586 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%560, %expanded_973 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%585 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %587 = tensor.empty() : tensor<1x80x4096xf32> + %cst_974 = arith.constant 2.000000e+00 : f32 + %588 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586 : tensor<1x80x4096xf32>) outs(%587 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_974 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_975 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %589 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%588 : 
tensor<1x80x4096xf32>) outs(%cst_975 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_976 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %590 = tensor.empty() : tensor<1x80x1xf32> + %591 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%589, %cst_976 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%590 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %592 = tensor.empty() : tensor<1x80x1xf32> + %593 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%591 : tensor<1x80x1xf32>) outs(%592 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %594 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_977 = tensor.collapse_shape %593 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %595 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586, %collapsed_977 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%594 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_978 = tensor.expand_shape %extracted_slice_9 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %596 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_979 = tensor.collapse_shape %expanded_978 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %597 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%collapsed_979, %595 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%596 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %598 = tensor.empty() : tensor<4096x4096xf32> + %599 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_136 : tensor<4096x4096xf32>) outs(%598 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_980 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_981 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %600 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_980, %599 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_981 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_982 = tensor.expand_shape %600 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %601 = tensor.empty() : tensor<4096x4096xf32> + %602 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_138 : tensor<4096x4096xf32>) outs(%601 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_983 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_984 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %603 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_983, %602 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_984 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_985 = tensor.expand_shape %603 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %604 = tensor.empty() : tensor<4096x4096xf32> + %605 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_140 : tensor<4096x4096xf32>) outs(%604 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_986 = tensor.collapse_shape %597 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_987 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %606 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_986, %605 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_987 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_988 = tensor.expand_shape %606 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_989 = tensor.expand_shape %expanded_982 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %607 = tensor.empty() : tensor<1x32x80x128xf32> + %608 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_989 : tensor<1x80x32x128xf32>) outs(%607 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_990 = tensor.expand_shape %expanded_985 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %609 = tensor.empty() : tensor<1x32x80x128xf32> + %610 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_990 : tensor<1x80x32x128xf32>) outs(%609 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_991 = tensor.expand_shape %expanded_988 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %611 = tensor.empty() : tensor<1x32x80x128xf32> + %612 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_991 : tensor<1x80x32x128xf32>) outs(%611 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_992 
= tensor.extract_slice %expanded_536[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_993 = tensor.extract_slice %expanded_538[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %613 = tensor.empty() : tensor<1x80x128xf32> + %614 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_992 : tensor<1x1x80x128xf32>) outs(%613 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %615 = tensor.empty() : tensor<80x128xf32> + %616 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%614 : tensor<1x80x128xf32>) outs(%615 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %617 = tensor.empty() : tensor<1x80x128xf32> + %618 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_993 : tensor<1x1x80x128xf32>) outs(%617 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %619 = tensor.empty() : tensor<80x128xf32> + %620 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%618 : tensor<1x80x128xf32>) outs(%619 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %621 = tensor.empty() : tensor<1x80x128xf32> + %622 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%621 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %616[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + 
%expanded_994 = tensor.expand_shape %622 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %623 = tensor.empty() : tensor<1x80x128xf32> + %624 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%623 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %620[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_995 = tensor.expand_shape %624 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %625 = tensor.empty() : tensor<1x32x80x128xf32> + %626 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%608, %622 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%625 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_996 = tensor.extract_slice %608[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_997 = tensor.extract_slice %608[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %627 = tensor.empty() : tensor<1x32x80x64xf32> + %628 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_997 : tensor<1x32x80x64xf32>) outs(%627 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %629 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_998 = tensor.insert_slice %628 into %629[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_999 = tensor.insert_slice %extracted_slice_996 into %inserted_slice_998[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %630 = tensor.empty() : tensor<1x32x80x128xf32> + %631 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_999, %624 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%630 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %632 = tensor.empty() : tensor<1x32x80x128xf32> + %633 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%626, %631 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%632 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %634 = tensor.empty() : tensor<1x32x80x128xf32> + %635 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%610, %622 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%634 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1000 = tensor.extract_slice %610[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1001 = tensor.extract_slice %610[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %636 = tensor.empty() : tensor<1x32x80x64xf32> + %637 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1001 : 
tensor<1x32x80x64xf32>) outs(%636 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %638 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1002 = tensor.insert_slice %637 into %638[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1003 = tensor.insert_slice %extracted_slice_1000 into %inserted_slice_1002[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %639 = tensor.empty() : tensor<1x32x80x128xf32> + %640 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1003, %624 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%639 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %641 = tensor.empty() : tensor<1x32x80x128xf32> + %642 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%635, %640 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%641 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %643 = tensor.empty() : tensor<1x32x128x80xf32> + %644 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%642 : tensor<1x32x80x128xf32>) outs(%643 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1004 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1005 = tensor.collapse_shape %633 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1006 = 
arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1007 = tensor.collapse_shape %644 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1008 = arith.constant 0.000000e+00 : f32 + %645 = tensor.empty() : tensor<32x80x80xf32> + %646 = linalg.fill ins(%cst_1008 : f32) outs(%645 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %647 = linalg.batch_matmul ins(%collapsed_1005, %collapsed_1007 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%646 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1009 = tensor.expand_shape %647 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1010 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %648 = tensor.empty() : tensor<1x32x80x80xf32> + %649 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1010 : tensor<1x32x80x80xf32>) outs(%648 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %650 = tensor.empty() : tensor<1x32x80x80xf32> + %651 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1009, %649 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%650 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %652 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1011 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %653 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%651, %collapsed_1011 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%652 : tensor<1x32x80x80xf32>) 
{ + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %654 = tensor.empty() : tensor<1x32x80x1xf32> + %655 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%654 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %656 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%653 : tensor<1x32x80x80xf32>) outs(%655 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %657 = tensor.empty() : tensor<1x32x80x80xf32> + %658 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%653, %656 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%657 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %659 = tensor.empty() : tensor<1x32x80x1xf32> + %660 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%659 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %661 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%658 : tensor<1x32x80x80xf32>) outs(%660 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %662 = tensor.empty() : tensor<1x32x80x80xf32> + %663 = linalg.generic {indexing_maps = [#map7, 
#map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%658, %661 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%662 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1012 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1013 = tensor.collapse_shape %663 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1014 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1015 = tensor.collapse_shape %612 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1016 = arith.constant 0.000000e+00 : f32 + %664 = tensor.empty() : tensor<32x80x128xf32> + %665 = linalg.fill ins(%cst_1016 : f32) outs(%664 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %666 = linalg.batch_matmul ins(%collapsed_1013, %collapsed_1015 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%665 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1017 = tensor.expand_shape %666 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %667 = tensor.empty() : tensor<1x80x32x128xf32> + %668 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1017 : tensor<1x32x80x128xf32>) outs(%667 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1018 = tensor.collapse_shape %668 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %669 = tensor.empty() : tensor<4096x4096xf32> + %670 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_142 : tensor<4096x4096xf32>) outs(%669 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1019 = 
tensor.collapse_shape %collapsed_1018 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1020 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %671 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1019, %670 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1020 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1021 = tensor.expand_shape %671 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %672 = tensor.empty() : tensor<1x80x4096xf32> + %673 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%586, %expanded_1021 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%672 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %674 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1022 = arith.constant 2.000000e+00 : f32 + %675 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673 : tensor<1x80x4096xf32>) outs(%674 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1022 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1023 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %676 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%675 : tensor<1x80x4096xf32>) outs(%cst_1023 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1024 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %677 = tensor.empty() : tensor<1x80x1xf32> + %678 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%676, %cst_1024 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%677 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %679 = tensor.empty() : tensor<1x80x1xf32> + %680 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%678 : tensor<1x80x1xf32>) outs(%679 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %681 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1025 = tensor.collapse_shape %680 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %682 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673, %collapsed_1025 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%681 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1026 = tensor.expand_shape %extracted_slice_10 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %683 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1027 = tensor.collapse_shape %expanded_1026 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %684 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1027, %682 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%683 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %685 = tensor.empty() : tensor<4096x11008xf32> + %686 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_144 : tensor<11008x4096xf32>) outs(%685 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1028 = tensor.collapse_shape %684 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1029 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %687 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1028, %686 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1029 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1030 = tensor.expand_shape %687 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %688 = tensor.empty() : tensor<1x80x11008xf32> + %689 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1030 : tensor<1x80x11008xf32>) outs(%688 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %690 = tensor.empty() : tensor<4096x11008xf32> + %691 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_146 : tensor<11008x4096xf32>) outs(%690 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1031 = tensor.collapse_shape %684 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1032 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %692 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1031, %691 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1032 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1033 = tensor.expand_shape %692 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %693 = tensor.empty() : tensor<1x80x11008xf32> + %694 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", 
"parallel"]} ins(%689, %expanded_1033 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%693 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %695 = tensor.empty() : tensor<11008x4096xf32> + %696 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_148 : tensor<4096x11008xf32>) outs(%695 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1034 = tensor.collapse_shape %694 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1035 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %697 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1034, %696 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1035 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1036 = tensor.expand_shape %697 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %698 = tensor.empty() : tensor<1x80x4096xf32> + %699 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673, %expanded_1036 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%698 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %700 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1037 = arith.constant 2.000000e+00 : f32 + %701 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699 : tensor<1x80x4096xf32>) outs(%700 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1037 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1038 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %702 = linalg.generic {indexing_maps = [#map8, 
#map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%701 : tensor<1x80x4096xf32>) outs(%cst_1038 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1039 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %703 = tensor.empty() : tensor<1x80x1xf32> + %704 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%702, %cst_1039 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%703 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %705 = tensor.empty() : tensor<1x80x1xf32> + %706 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%704 : tensor<1x80x1xf32>) outs(%705 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %707 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1040 = tensor.collapse_shape %706 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %708 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699, %collapsed_1040 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%707 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1041 = tensor.expand_shape %extracted_slice_11 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %709 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1042 = tensor.collapse_shape %expanded_1041 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %710 = linalg.generic 
{indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1042, %708 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%709 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %711 = tensor.empty() : tensor<4096x4096xf32> + %712 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_150 : tensor<4096x4096xf32>) outs(%711 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1043 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1044 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %713 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1043, %712 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1044 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1045 = tensor.expand_shape %713 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %714 = tensor.empty() : tensor<4096x4096xf32> + %715 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_152 : tensor<4096x4096xf32>) outs(%714 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1046 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1047 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %716 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1046, %715 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1047 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1048 = tensor.expand_shape %716 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %717 = tensor.empty() : tensor<4096x4096xf32> + %718 = linalg.generic {indexing_maps = [#map, 
#map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_154 : tensor<4096x4096xf32>) outs(%717 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1049 = tensor.collapse_shape %710 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1050 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %719 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1049, %718 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1050 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1051 = tensor.expand_shape %719 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1052 = tensor.expand_shape %expanded_1045 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %720 = tensor.empty() : tensor<1x32x80x128xf32> + %721 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1052 : tensor<1x80x32x128xf32>) outs(%720 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1053 = tensor.expand_shape %expanded_1048 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %722 = tensor.empty() : tensor<1x32x80x128xf32> + %723 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1053 : tensor<1x80x32x128xf32>) outs(%722 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1054 = tensor.expand_shape %expanded_1051 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %724 = tensor.empty() : tensor<1x32x80x128xf32> + %725 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1054 : tensor<1x80x32x128xf32>) outs(%724 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1055 = tensor.extract_slice %expanded_540[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1056 = tensor.extract_slice %expanded_542[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %726 = tensor.empty() : tensor<1x80x128xf32> + %727 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1055 : tensor<1x1x80x128xf32>) outs(%726 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %728 = tensor.empty() : tensor<80x128xf32> + %729 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%727 : tensor<1x80x128xf32>) outs(%728 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %730 = tensor.empty() : tensor<1x80x128xf32> + %731 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1056 : tensor<1x1x80x128xf32>) outs(%730 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %732 = tensor.empty() : tensor<80x128xf32> + %733 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%731 : tensor<1x80x128xf32>) outs(%732 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %734 = tensor.empty() : tensor<1x80x128xf32> + %735 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%734 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : 
index + %extracted = tensor.extract %729[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1057 = tensor.expand_shape %735 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %736 = tensor.empty() : tensor<1x80x128xf32> + %737 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%736 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %733[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1058 = tensor.expand_shape %737 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %738 = tensor.empty() : tensor<1x32x80x128xf32> + %739 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%721, %735 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%738 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1059 = tensor.extract_slice %721[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1060 = tensor.extract_slice %721[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %740 = tensor.empty() : tensor<1x32x80x64xf32> + %741 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1060 : tensor<1x32x80x64xf32>) outs(%740 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %742 = tensor.empty() : tensor<1x32x80x128xf32> + 
%inserted_slice_1061 = tensor.insert_slice %741 into %742[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1062 = tensor.insert_slice %extracted_slice_1059 into %inserted_slice_1061[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %743 = tensor.empty() : tensor<1x32x80x128xf32> + %744 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1062, %737 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%743 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %745 = tensor.empty() : tensor<1x32x80x128xf32> + %746 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%739, %744 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%745 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %747 = tensor.empty() : tensor<1x32x80x128xf32> + %748 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%723, %735 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%747 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1063 = tensor.extract_slice %723[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1064 = tensor.extract_slice %723[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %749 = tensor.empty() : tensor<1x32x80x64xf32> + %750 = 
linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1064 : tensor<1x32x80x64xf32>) outs(%749 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %751 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1065 = tensor.insert_slice %750 into %751[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1066 = tensor.insert_slice %extracted_slice_1063 into %inserted_slice_1065[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %752 = tensor.empty() : tensor<1x32x80x128xf32> + %753 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1066, %737 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%752 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %754 = tensor.empty() : tensor<1x32x80x128xf32> + %755 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%748, %753 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%754 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %756 = tensor.empty() : tensor<1x32x128x80xf32> + %757 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%755 : tensor<1x32x80x128xf32>) outs(%756 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1067 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> 
+ %collapsed_1068 = tensor.collapse_shape %746 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1069 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1070 = tensor.collapse_shape %757 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1071 = arith.constant 0.000000e+00 : f32 + %758 = tensor.empty() : tensor<32x80x80xf32> + %759 = linalg.fill ins(%cst_1071 : f32) outs(%758 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %760 = linalg.batch_matmul ins(%collapsed_1068, %collapsed_1070 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%759 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1072 = tensor.expand_shape %760 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1073 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %761 = tensor.empty() : tensor<1x32x80x80xf32> + %762 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1073 : tensor<1x32x80x80xf32>) outs(%761 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %763 = tensor.empty() : tensor<1x32x80x80xf32> + %764 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1072, %762 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%763 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %765 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1074 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %766 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%764, %collapsed_1074 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%765 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %767 = tensor.empty() : tensor<1x32x80x1xf32> + %768 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%767 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %769 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%766 : tensor<1x32x80x80xf32>) outs(%768 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %770 = tensor.empty() : tensor<1x32x80x80xf32> + %771 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%766, %769 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%770 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %772 = tensor.empty() : tensor<1x32x80x1xf32> + %773 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%772 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %774 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%771 : tensor<1x32x80x80xf32>) outs(%773 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 
: f32 + } -> tensor<1x32x80x1xf32> + %775 = tensor.empty() : tensor<1x32x80x80xf32> + %776 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%771, %774 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%775 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1075 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1076 = tensor.collapse_shape %776 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1077 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1078 = tensor.collapse_shape %725 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1079 = arith.constant 0.000000e+00 : f32 + %777 = tensor.empty() : tensor<32x80x128xf32> + %778 = linalg.fill ins(%cst_1079 : f32) outs(%777 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %779 = linalg.batch_matmul ins(%collapsed_1076, %collapsed_1078 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%778 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1080 = tensor.expand_shape %779 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %780 = tensor.empty() : tensor<1x80x32x128xf32> + %781 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1080 : tensor<1x32x80x128xf32>) outs(%780 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1081 = tensor.collapse_shape %781 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %782 = tensor.empty() : tensor<4096x4096xf32> + %783 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_156 : tensor<4096x4096xf32>) outs(%782 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1082 = tensor.collapse_shape %collapsed_1081 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1083 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %784 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1082, %783 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1083 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1084 = tensor.expand_shape %784 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %785 = tensor.empty() : tensor<1x80x4096xf32> + %786 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%699, %expanded_1084 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%785 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %787 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1085 = arith.constant 2.000000e+00 : f32 + %788 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786 : tensor<1x80x4096xf32>) outs(%787 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1085 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1086 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %789 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%788 : tensor<1x80x4096xf32>) outs(%cst_1086 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1087 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %790 = tensor.empty() : 
tensor<1x80x1xf32> + %791 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%789, %cst_1087 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%790 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %792 = tensor.empty() : tensor<1x80x1xf32> + %793 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%791 : tensor<1x80x1xf32>) outs(%792 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %794 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1088 = tensor.collapse_shape %793 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %795 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786, %collapsed_1088 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%794 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1089 = tensor.expand_shape %extracted_slice_12 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %796 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1090 = tensor.collapse_shape %expanded_1089 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %797 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1090, %795 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%796 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %798 = tensor.empty() : tensor<4096x11008xf32> + %799 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = 
["parallel", "parallel"]} ins(%expanded_158 : tensor<11008x4096xf32>) outs(%798 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1091 = tensor.collapse_shape %797 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1092 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %800 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1091, %799 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1092 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1093 = tensor.expand_shape %800 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %801 = tensor.empty() : tensor<1x80x11008xf32> + %802 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1093 : tensor<1x80x11008xf32>) outs(%801 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %803 = tensor.empty() : tensor<4096x11008xf32> + %804 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_160 : tensor<11008x4096xf32>) outs(%803 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1094 = tensor.collapse_shape %797 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1095 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %805 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1094, %804 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1095 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1096 = tensor.expand_shape %805 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %806 = 
tensor.empty() : tensor<1x80x11008xf32> + %807 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%802, %expanded_1096 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%806 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %808 = tensor.empty() : tensor<11008x4096xf32> + %809 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_162 : tensor<4096x11008xf32>) outs(%808 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1097 = tensor.collapse_shape %807 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1098 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %810 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1097, %809 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1098 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1099 = tensor.expand_shape %810 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %811 = tensor.empty() : tensor<1x80x4096xf32> + %812 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%786, %expanded_1099 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%811 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %813 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1100 = arith.constant 2.000000e+00 : f32 + %814 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812 : tensor<1x80x4096xf32>) outs(%813 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1100 : f32 + linalg.yield %3652 : f32 + } 
-> tensor<1x80x4096xf32> + %cst_1101 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %815 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%814 : tensor<1x80x4096xf32>) outs(%cst_1101 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1102 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %816 = tensor.empty() : tensor<1x80x1xf32> + %817 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%815, %cst_1102 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%816 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %818 = tensor.empty() : tensor<1x80x1xf32> + %819 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%817 : tensor<1x80x1xf32>) outs(%818 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %820 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1103 = tensor.collapse_shape %819 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %821 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812, %collapsed_1103 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%820 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1104 = tensor.expand_shape %extracted_slice_13 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %822 = tensor.empty() : tensor<1x80x4096xf32> + 
%collapsed_1105 = tensor.collapse_shape %expanded_1104 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %823 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1105, %821 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%822 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %824 = tensor.empty() : tensor<4096x4096xf32> + %825 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_164 : tensor<4096x4096xf32>) outs(%824 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1106 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1107 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %826 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1106, %825 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1107 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1108 = tensor.expand_shape %826 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %827 = tensor.empty() : tensor<4096x4096xf32> + %828 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_166 : tensor<4096x4096xf32>) outs(%827 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1109 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1110 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %829 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1109, %828 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1110 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1111 = tensor.expand_shape %829 [[0, 1], [2]] : 
tensor<80x4096xf32> into tensor<1x80x4096xf32> + %830 = tensor.empty() : tensor<4096x4096xf32> + %831 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_168 : tensor<4096x4096xf32>) outs(%830 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1112 = tensor.collapse_shape %823 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1113 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %832 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%collapsed_1112, %831 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1113 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1114 = tensor.expand_shape %832 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1115 = tensor.expand_shape %expanded_1108 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %833 = tensor.empty() : tensor<1x32x80x128xf32> + %834 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1115 : tensor<1x80x32x128xf32>) outs(%833 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1116 = tensor.expand_shape %expanded_1111 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %835 = tensor.empty() : tensor<1x32x80x128xf32> + %836 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1116 : tensor<1x80x32x128xf32>) outs(%835 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1117 = tensor.expand_shape %expanded_1114 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %837 = tensor.empty() : tensor<1x32x80x128xf32> + %838 = linalg.generic {indexing_maps = [#map10, 
#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1117 : tensor<1x80x32x128xf32>) outs(%837 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1118 = tensor.extract_slice %expanded_544[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1119 = tensor.extract_slice %expanded_546[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %839 = tensor.empty() : tensor<1x80x128xf32> + %840 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1118 : tensor<1x1x80x128xf32>) outs(%839 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %841 = tensor.empty() : tensor<80x128xf32> + %842 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%840 : tensor<1x80x128xf32>) outs(%841 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %843 = tensor.empty() : tensor<1x80x128xf32> + %844 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1119 : tensor<1x1x80x128xf32>) outs(%843 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %845 = tensor.empty() : tensor<80x128xf32> + %846 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%844 : tensor<1x80x128xf32>) outs(%845 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %847 = tensor.empty() : tensor<1x80x128xf32> + %848 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) 
outs(%847 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %842[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1120 = tensor.expand_shape %848 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %849 = tensor.empty() : tensor<1x80x128xf32> + %850 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%849 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %846[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1121 = tensor.expand_shape %850 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %851 = tensor.empty() : tensor<1x32x80x128xf32> + %852 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%834, %848 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%851 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1122 = tensor.extract_slice %834[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1123 = tensor.extract_slice %834[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %853 = tensor.empty() : tensor<1x32x80x64xf32> + %854 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1123 : tensor<1x32x80x64xf32>) outs(%853 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf 
%in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %855 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1124 = tensor.insert_slice %854 into %855[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1125 = tensor.insert_slice %extracted_slice_1122 into %inserted_slice_1124[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %856 = tensor.empty() : tensor<1x32x80x128xf32> + %857 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1125, %850 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%856 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %858 = tensor.empty() : tensor<1x32x80x128xf32> + %859 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%852, %857 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%858 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %860 = tensor.empty() : tensor<1x32x80x128xf32> + %861 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%836, %848 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%860 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1126 = tensor.extract_slice %836[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1127 = tensor.extract_slice %836[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 
1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %862 = tensor.empty() : tensor<1x32x80x64xf32> + %863 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1127 : tensor<1x32x80x64xf32>) outs(%862 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %864 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1128 = tensor.insert_slice %863 into %864[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1129 = tensor.insert_slice %extracted_slice_1126 into %inserted_slice_1128[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %865 = tensor.empty() : tensor<1x32x80x128xf32> + %866 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1129, %850 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%865 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %867 = tensor.empty() : tensor<1x32x80x128xf32> + %868 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%861, %866 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%867 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %869 = tensor.empty() : tensor<1x32x128x80xf32> + %870 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%868 : tensor<1x32x80x128xf32>) outs(%869 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield 
%in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1130 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1131 = tensor.collapse_shape %859 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1132 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1133 = tensor.collapse_shape %870 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1134 = arith.constant 0.000000e+00 : f32 + %871 = tensor.empty() : tensor<32x80x80xf32> + %872 = linalg.fill ins(%cst_1134 : f32) outs(%871 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %873 = linalg.batch_matmul ins(%collapsed_1131, %collapsed_1133 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%872 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1135 = tensor.expand_shape %873 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1136 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %874 = tensor.empty() : tensor<1x32x80x80xf32> + %875 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1136 : tensor<1x32x80x80xf32>) outs(%874 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %876 = tensor.empty() : tensor<1x32x80x80xf32> + %877 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1135, %875 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%876 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %878 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1137 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into 
tensor<1x80x80xf32> + %879 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%877, %collapsed_1137 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%878 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %880 = tensor.empty() : tensor<1x32x80x1xf32> + %881 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%880 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %882 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%879 : tensor<1x32x80x80xf32>) outs(%880 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %883 = tensor.empty() : tensor<1x32x80x80xf32> + %884 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%879, %882 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%883 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %885 = tensor.empty() : tensor<1x32x80x1xf32> + %886 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%885 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %887 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%884 : 
tensor<1x32x80x80xf32>) outs(%886 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %888 = tensor.empty() : tensor<1x32x80x80xf32> + %889 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%884, %887 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%888 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1138 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1139 = tensor.collapse_shape %889 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1140 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1141 = tensor.collapse_shape %838 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1142 = arith.constant 0.000000e+00 : f32 + %890 = tensor.empty() : tensor<32x80x128xf32> + %891 = linalg.fill ins(%cst_1142 : f32) outs(%890 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %892 = linalg.batch_matmul ins(%collapsed_1139, %collapsed_1141 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%891 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1143 = tensor.expand_shape %892 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %893 = tensor.empty() : tensor<1x80x32x128xf32> + %894 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1143 : tensor<1x32x80x128xf32>) outs(%893 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1144 = tensor.collapse_shape %894 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %895 = tensor.empty() : tensor<4096x4096xf32> + %896 
= linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_170 : tensor<4096x4096xf32>) outs(%895 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1145 = tensor.collapse_shape %collapsed_1144 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1146 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %897 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1145, %896 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1146 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1147 = tensor.expand_shape %897 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %898 = tensor.empty() : tensor<1x80x4096xf32> + %899 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%812, %expanded_1147 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%898 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %900 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1148 = arith.constant 2.000000e+00 : f32 + %901 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899 : tensor<1x80x4096xf32>) outs(%900 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1148 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1149 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %902 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%901 : tensor<1x80x4096xf32>) outs(%cst_1149 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield 
%3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1150 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %903 = tensor.empty() : tensor<1x80x1xf32> + %904 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%902, %cst_1150 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%903 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %905 = tensor.empty() : tensor<1x80x1xf32> + %906 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%904 : tensor<1x80x1xf32>) outs(%905 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %907 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1151 = tensor.collapse_shape %906 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %908 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899, %collapsed_1151 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%907 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1152 = tensor.expand_shape %extracted_slice_14 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %909 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1153 = tensor.collapse_shape %expanded_1152 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %910 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1153, %908 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%909 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> 
tensor<1x80x4096xf32> + %911 = tensor.empty() : tensor<4096x11008xf32> + %912 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_172 : tensor<11008x4096xf32>) outs(%911 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1154 = tensor.collapse_shape %910 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1155 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %913 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1154, %912 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1155 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1156 = tensor.expand_shape %913 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %914 = tensor.empty() : tensor<1x80x11008xf32> + %915 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1156 : tensor<1x80x11008xf32>) outs(%914 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %916 = tensor.empty() : tensor<4096x11008xf32> + %917 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_174 : tensor<11008x4096xf32>) outs(%916 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1157 = tensor.collapse_shape %910 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1158 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %918 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1157, %917 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1158 : tensor<80x11008xf32>) -> 
tensor<80x11008xf32> + %expanded_1159 = tensor.expand_shape %918 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %919 = tensor.empty() : tensor<1x80x11008xf32> + %920 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%915, %expanded_1159 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%919 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %921 = tensor.empty() : tensor<11008x4096xf32> + %922 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_176 : tensor<4096x11008xf32>) outs(%921 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1160 = tensor.collapse_shape %920 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1161 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %923 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1160, %922 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1161 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1162 = tensor.expand_shape %923 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %924 = tensor.empty() : tensor<1x80x4096xf32> + %925 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%899, %expanded_1162 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%924 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %926 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1163 = arith.constant 2.000000e+00 : f32 + %927 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925 : tensor<1x80x4096xf32>) 
outs(%926 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1163 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1164 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %928 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%927 : tensor<1x80x4096xf32>) outs(%cst_1164 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1165 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %929 = tensor.empty() : tensor<1x80x1xf32> + %930 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%928, %cst_1165 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%929 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %931 = tensor.empty() : tensor<1x80x1xf32> + %932 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%930 : tensor<1x80x1xf32>) outs(%931 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %933 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1166 = tensor.collapse_shape %932 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %934 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925, %collapsed_1166 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%933 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1167 = 
tensor.expand_shape %extracted_slice_15 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %935 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1168 = tensor.collapse_shape %expanded_1167 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %936 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1168, %934 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%935 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %937 = tensor.empty() : tensor<4096x4096xf32> + %938 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_178 : tensor<4096x4096xf32>) outs(%937 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1169 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1170 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %939 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1169, %938 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1170 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1171 = tensor.expand_shape %939 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %940 = tensor.empty() : tensor<4096x4096xf32> + %941 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_180 : tensor<4096x4096xf32>) outs(%940 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1172 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1173 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %942 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1172, %941 : tensor<80x4096xf32>, 
tensor<4096x4096xf32>) outs(%cst_1173 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1174 = tensor.expand_shape %942 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %943 = tensor.empty() : tensor<4096x4096xf32> + %944 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_182 : tensor<4096x4096xf32>) outs(%943 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1175 = tensor.collapse_shape %936 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1176 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %945 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1175, %944 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1176 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1177 = tensor.expand_shape %945 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1178 = tensor.expand_shape %expanded_1171 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %946 = tensor.empty() : tensor<1x32x80x128xf32> + %947 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1178 : tensor<1x80x32x128xf32>) outs(%946 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1179 = tensor.expand_shape %expanded_1174 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %948 = tensor.empty() : tensor<1x32x80x128xf32> + %949 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1179 : tensor<1x80x32x128xf32>) outs(%948 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1180 = tensor.expand_shape %expanded_1177 [[0], [1], [2, 3]] : 
tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %950 = tensor.empty() : tensor<1x32x80x128xf32> + %951 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1180 : tensor<1x80x32x128xf32>) outs(%950 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1181 = tensor.extract_slice %expanded_548[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1182 = tensor.extract_slice %expanded_550[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %952 = tensor.empty() : tensor<1x80x128xf32> + %953 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1181 : tensor<1x1x80x128xf32>) outs(%952 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %954 = tensor.empty() : tensor<80x128xf32> + %955 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%953 : tensor<1x80x128xf32>) outs(%954 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %956 = tensor.empty() : tensor<1x80x128xf32> + %957 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1182 : tensor<1x1x80x128xf32>) outs(%956 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %958 = tensor.empty() : tensor<80x128xf32> + %959 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%957 : tensor<1x80x128xf32>) outs(%958 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %960 = tensor.empty() : tensor<1x80x128xf32> + 
%961 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%960 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %955[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1183 = tensor.expand_shape %961 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %962 = tensor.empty() : tensor<1x80x128xf32> + %963 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%962 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %959[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1184 = tensor.expand_shape %963 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %964 = tensor.empty() : tensor<1x32x80x128xf32> + %965 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%947, %961 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%964 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1185 = tensor.extract_slice %947[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1186 = tensor.extract_slice %947[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %966 = tensor.empty() : tensor<1x32x80x64xf32> + %967 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%extracted_slice_1186 : tensor<1x32x80x64xf32>) outs(%966 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %968 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1187 = tensor.insert_slice %967 into %968[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1188 = tensor.insert_slice %extracted_slice_1185 into %inserted_slice_1187[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %969 = tensor.empty() : tensor<1x32x80x128xf32> + %970 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1188, %963 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%969 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %971 = tensor.empty() : tensor<1x32x80x128xf32> + %972 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%965, %970 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%971 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %973 = tensor.empty() : tensor<1x32x80x128xf32> + %974 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%949, %961 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%973 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1189 = tensor.extract_slice %949[0, 0, 0, 0] [1, 32, 80, 64] [1, 
1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1190 = tensor.extract_slice %949[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %975 = tensor.empty() : tensor<1x32x80x64xf32> + %976 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1190 : tensor<1x32x80x64xf32>) outs(%975 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %977 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1191 = tensor.insert_slice %976 into %977[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1192 = tensor.insert_slice %extracted_slice_1189 into %inserted_slice_1191[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %978 = tensor.empty() : tensor<1x32x80x128xf32> + %979 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1192, %963 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%978 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %980 = tensor.empty() : tensor<1x32x80x128xf32> + %981 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%974, %979 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%980 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %982 = tensor.empty() : tensor<1x32x128x80xf32> + %983 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%981 : tensor<1x32x80x128xf32>) outs(%982 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1193 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1194 = tensor.collapse_shape %972 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1195 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1196 = tensor.collapse_shape %983 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1197 = arith.constant 0.000000e+00 : f32 + %984 = tensor.empty() : tensor<32x80x80xf32> + %985 = linalg.fill ins(%cst_1197 : f32) outs(%984 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %986 = linalg.batch_matmul ins(%collapsed_1194, %collapsed_1196 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%985 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1198 = tensor.expand_shape %986 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1199 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %987 = tensor.empty() : tensor<1x32x80x80xf32> + %988 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1199 : tensor<1x32x80x80xf32>) outs(%987 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %989 = tensor.empty() : tensor<1x32x80x80xf32> + %990 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1198, %988 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%989 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> 
+ %991 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1200 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %992 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%990, %collapsed_1200 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%991 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %993 = tensor.empty() : tensor<1x32x80x1xf32> + %994 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%993 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %995 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%992 : tensor<1x32x80x80xf32>) outs(%993 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %996 = tensor.empty() : tensor<1x32x80x80xf32> + %997 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%992, %995 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%996 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %998 = tensor.empty() : tensor<1x32x80x1xf32> + %999 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%998 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1000 = 
linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%997 : tensor<1x32x80x80xf32>) outs(%999 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1001 = tensor.empty() : tensor<1x32x80x80xf32> + %1002 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%997, %1000 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1001 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1201 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1202 = tensor.collapse_shape %1002 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1203 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1204 = tensor.collapse_shape %951 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1205 = arith.constant 0.000000e+00 : f32 + %1003 = tensor.empty() : tensor<32x80x128xf32> + %1004 = linalg.fill ins(%cst_1205 : f32) outs(%1003 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1005 = linalg.batch_matmul ins(%collapsed_1202, %collapsed_1204 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1004 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1206 = tensor.expand_shape %1005 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1006 = tensor.empty() : tensor<1x80x32x128xf32> + %1007 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1206 : tensor<1x32x80x128xf32>) outs(%1006 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1207 = 
tensor.collapse_shape %1007 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1008 = tensor.empty() : tensor<4096x4096xf32> + %1009 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_184 : tensor<4096x4096xf32>) outs(%1008 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1208 = tensor.collapse_shape %collapsed_1207 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1209 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1010 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1208, %1009 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1209 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1210 = tensor.expand_shape %1010 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1011 = tensor.empty() : tensor<1x80x4096xf32> + %1012 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%925, %expanded_1210 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1011 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1013 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1211 = arith.constant 2.000000e+00 : f32 + %1014 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012 : tensor<1x80x4096xf32>) outs(%1013 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1211 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1212 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1015 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1014 : tensor<1x80x4096xf32>) outs(%cst_1212 : tensor<1x80x1xf32>) { + 
^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1213 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1016 = tensor.empty() : tensor<1x80x1xf32> + %1017 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1015, %cst_1213 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1016 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1018 = tensor.empty() : tensor<1x80x1xf32> + %1019 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1017 : tensor<1x80x1xf32>) outs(%1018 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1020 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1214 = tensor.collapse_shape %1019 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1021 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012, %collapsed_1214 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1020 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1215 = tensor.expand_shape %extracted_slice_16 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1022 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1216 = tensor.collapse_shape %expanded_1215 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1023 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1216, %1021 : tensor<1x4096xf32>, 
tensor<1x80x4096xf32>) outs(%1022 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1024 = tensor.empty() : tensor<4096x11008xf32> + %1025 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_186 : tensor<11008x4096xf32>) outs(%1024 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1217 = tensor.collapse_shape %1023 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1218 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1026 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1217, %1025 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1218 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1219 = tensor.expand_shape %1026 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1027 = tensor.empty() : tensor<1x80x11008xf32> + %1028 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1219 : tensor<1x80x11008xf32>) outs(%1027 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1029 = tensor.empty() : tensor<4096x11008xf32> + %1030 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_188 : tensor<11008x4096xf32>) outs(%1029 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1220 = tensor.collapse_shape %1023 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1221 = arith.constant dense<0.000000e+00> 
: tensor<80x11008xf32> + %1031 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1220, %1030 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1221 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1222 = tensor.expand_shape %1031 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1032 = tensor.empty() : tensor<1x80x11008xf32> + %1033 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1028, %expanded_1222 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1032 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1034 = tensor.empty() : tensor<11008x4096xf32> + %1035 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_190 : tensor<4096x11008xf32>) outs(%1034 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1223 = tensor.collapse_shape %1033 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1224 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1036 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1223, %1035 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1224 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1225 = tensor.expand_shape %1036 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1037 = tensor.empty() : tensor<1x80x4096xf32> + %1038 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1012, %expanded_1225 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1037 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1039 = tensor.empty() : 
tensor<1x80x4096xf32> + %cst_1226 = arith.constant 2.000000e+00 : f32 + %1040 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038 : tensor<1x80x4096xf32>) outs(%1039 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1226 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1227 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1041 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1040 : tensor<1x80x4096xf32>) outs(%cst_1227 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1228 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1042 = tensor.empty() : tensor<1x80x1xf32> + %1043 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1041, %cst_1228 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1042 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1044 = tensor.empty() : tensor<1x80x1xf32> + %1045 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1043 : tensor<1x80x1xf32>) outs(%1044 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1046 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1229 = tensor.collapse_shape %1045 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1047 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038, %collapsed_1229 : 
tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1046 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1230 = tensor.expand_shape %extracted_slice_17 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1048 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1231 = tensor.collapse_shape %expanded_1230 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1049 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1231, %1047 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1048 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1050 = tensor.empty() : tensor<4096x4096xf32> + %1051 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_192 : tensor<4096x4096xf32>) outs(%1050 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1232 = tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1233 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1052 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1232, %1051 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1233 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1234 = tensor.expand_shape %1052 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1053 = tensor.empty() : tensor<4096x4096xf32> + %1054 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_194 : tensor<4096x4096xf32>) outs(%1053 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1235 = 
tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1236 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1055 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1235, %1054 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1236 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1237 = tensor.expand_shape %1055 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1056 = tensor.empty() : tensor<4096x4096xf32> + %1057 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_196 : tensor<4096x4096xf32>) outs(%1056 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1238 = tensor.collapse_shape %1049 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1239 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1058 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1238, %1057 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1239 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1240 = tensor.expand_shape %1058 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1241 = tensor.expand_shape %expanded_1234 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1059 = tensor.empty() : tensor<1x32x80x128xf32> + %1060 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1241 : tensor<1x80x32x128xf32>) outs(%1059 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1242 = tensor.expand_shape %expanded_1237 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1061 = tensor.empty() : tensor<1x32x80x128xf32> + %1062 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%expanded_1242 : tensor<1x80x32x128xf32>) outs(%1061 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1243 = tensor.expand_shape %expanded_1240 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1063 = tensor.empty() : tensor<1x32x80x128xf32> + %1064 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1243 : tensor<1x80x32x128xf32>) outs(%1063 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1244 = tensor.extract_slice %expanded_552[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1245 = tensor.extract_slice %expanded_554[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1065 = tensor.empty() : tensor<1x80x128xf32> + %1066 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1244 : tensor<1x1x80x128xf32>) outs(%1065 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1067 = tensor.empty() : tensor<80x128xf32> + %1068 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1066 : tensor<1x80x128xf32>) outs(%1067 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1069 = tensor.empty() : tensor<1x80x128xf32> + %1070 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1245 : tensor<1x1x80x128xf32>) outs(%1069 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1071 = tensor.empty() : tensor<80x128xf32> + %1072 = 
linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1070 : tensor<1x80x128xf32>) outs(%1071 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1073 = tensor.empty() : tensor<1x80x128xf32> + %1074 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1073 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1068[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1246 = tensor.expand_shape %1074 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1075 = tensor.empty() : tensor<1x80x128xf32> + %1076 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1075 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1072[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1247 = tensor.expand_shape %1076 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1077 = tensor.empty() : tensor<1x32x80x128xf32> + %1078 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1060, %1074 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1077 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1248 = tensor.extract_slice %1060[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + 
%extracted_slice_1249 = tensor.extract_slice %1060[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1079 = tensor.empty() : tensor<1x32x80x64xf32> + %1080 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1249 : tensor<1x32x80x64xf32>) outs(%1079 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1081 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1250 = tensor.insert_slice %1080 into %1081[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1251 = tensor.insert_slice %extracted_slice_1248 into %inserted_slice_1250[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1082 = tensor.empty() : tensor<1x32x80x128xf32> + %1083 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1251, %1076 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1082 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1084 = tensor.empty() : tensor<1x32x80x128xf32> + %1085 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1078, %1083 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1084 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1086 = tensor.empty() : tensor<1x32x80x128xf32> + %1087 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
ins(%1062, %1074 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1086 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1252 = tensor.extract_slice %1062[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1253 = tensor.extract_slice %1062[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1088 = tensor.empty() : tensor<1x32x80x64xf32> + %1089 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1253 : tensor<1x32x80x64xf32>) outs(%1088 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1090 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1254 = tensor.insert_slice %1089 into %1090[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1255 = tensor.insert_slice %extracted_slice_1252 into %inserted_slice_1254[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1091 = tensor.empty() : tensor<1x32x80x128xf32> + %1092 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1255, %1076 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1091 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1093 = tensor.empty() : tensor<1x32x80x128xf32> + %1094 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1087, %1092 : tensor<1x32x80x128xf32>, 
tensor<1x32x80x128xf32>) outs(%1093 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1095 = tensor.empty() : tensor<1x32x128x80xf32> + %1096 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1094 : tensor<1x32x80x128xf32>) outs(%1095 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1256 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1257 = tensor.collapse_shape %1085 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1258 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1259 = tensor.collapse_shape %1096 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1260 = arith.constant 0.000000e+00 : f32 + %1097 = tensor.empty() : tensor<32x80x80xf32> + %1098 = linalg.fill ins(%cst_1260 : f32) outs(%1097 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1099 = linalg.batch_matmul ins(%collapsed_1257, %collapsed_1259 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1098 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1261 = tensor.expand_shape %1099 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1262 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1100 = tensor.empty() : tensor<1x32x80x80xf32> + %1101 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1262 : tensor<1x32x80x80xf32>) outs(%1100 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1102 = tensor.empty() : tensor<1x32x80x80xf32> + %1103 = 
linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1261, %1101 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1102 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1104 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1263 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1105 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1103, %collapsed_1263 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1104 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1106 = tensor.empty() : tensor<1x32x80x1xf32> + %1107 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1106 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1108 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1105 : tensor<1x32x80x80xf32>) outs(%1106 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1109 = tensor.empty() : tensor<1x32x80x80xf32> + %1110 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1105, %1108 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1109 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + 
linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1111 = tensor.empty() : tensor<1x32x80x1xf32> + %1112 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1111 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1113 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1110 : tensor<1x32x80x80xf32>) outs(%1112 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1114 = tensor.empty() : tensor<1x32x80x80xf32> + %1115 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1110, %1113 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1114 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1264 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1265 = tensor.collapse_shape %1115 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1266 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1267 = tensor.collapse_shape %1064 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1268 = arith.constant 0.000000e+00 : f32 + %1116 = tensor.empty() : tensor<32x80x128xf32> + %1117 = linalg.fill ins(%cst_1268 : f32) outs(%1116 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1118 = linalg.batch_matmul ins(%collapsed_1265, %collapsed_1267 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1117 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1269 = tensor.expand_shape %1118 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into 
tensor<1x32x80x128xf32> + %1119 = tensor.empty() : tensor<1x80x32x128xf32> + %1120 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1269 : tensor<1x32x80x128xf32>) outs(%1119 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1270 = tensor.collapse_shape %1120 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1121 = tensor.empty() : tensor<4096x4096xf32> + %1122 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_198 : tensor<4096x4096xf32>) outs(%1121 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1271 = tensor.collapse_shape %collapsed_1270 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1272 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1123 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1271, %1122 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1272 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1273 = tensor.expand_shape %1123 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1124 = tensor.empty() : tensor<1x80x4096xf32> + %1125 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1038, %expanded_1273 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1124 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1126 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1274 = arith.constant 2.000000e+00 : f32 + %1127 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125 : tensor<1x80x4096xf32>) outs(%1126 : tensor<1x80x4096xf32>) { + 
^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1274 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1275 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1128 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1127 : tensor<1x80x4096xf32>) outs(%cst_1275 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1276 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1129 = tensor.empty() : tensor<1x80x1xf32> + %1130 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1128, %cst_1276 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1129 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1131 = tensor.empty() : tensor<1x80x1xf32> + %1132 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1130 : tensor<1x80x1xf32>) outs(%1131 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1133 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1277 = tensor.collapse_shape %1132 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1134 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125, %collapsed_1277 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1133 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1278 = tensor.expand_shape %extracted_slice_18 
[[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1135 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1279 = tensor.collapse_shape %expanded_1278 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1136 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1279, %1134 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1135 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1137 = tensor.empty() : tensor<4096x11008xf32> + %1138 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_200 : tensor<11008x4096xf32>) outs(%1137 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1280 = tensor.collapse_shape %1136 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1281 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1139 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1280, %1138 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1281 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1282 = tensor.expand_shape %1139 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1140 = tensor.empty() : tensor<1x80x11008xf32> + %1141 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1282 : tensor<1x80x11008xf32>) outs(%1140 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1142 = tensor.empty() : tensor<4096x11008xf32> + %1143 = linalg.generic 
{indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_202 : tensor<11008x4096xf32>) outs(%1142 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1283 = tensor.collapse_shape %1136 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1284 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1144 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1283, %1143 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1284 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1285 = tensor.expand_shape %1144 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1145 = tensor.empty() : tensor<1x80x11008xf32> + %1146 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1141, %expanded_1285 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1145 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1147 = tensor.empty() : tensor<11008x4096xf32> + %1148 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_204 : tensor<4096x11008xf32>) outs(%1147 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1286 = tensor.collapse_shape %1146 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1287 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1149 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1286, %1148 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1287 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1288 = tensor.expand_shape %1149 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1150 = tensor.empty() : tensor<1x80x4096xf32> + %1151 
= linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1125, %expanded_1288 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1150 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1152 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1289 = arith.constant 2.000000e+00 : f32 + %1153 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151 : tensor<1x80x4096xf32>) outs(%1152 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1289 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1290 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1154 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1153 : tensor<1x80x4096xf32>) outs(%cst_1290 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1291 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1155 = tensor.empty() : tensor<1x80x1xf32> + %1156 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1154, %cst_1291 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1155 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1157 = tensor.empty() : tensor<1x80x1xf32> + %1158 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1156 : tensor<1x80x1xf32>) outs(%1157 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = 
math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1159 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1292 = tensor.collapse_shape %1158 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1160 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151, %collapsed_1292 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1159 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1293 = tensor.expand_shape %extracted_slice_19 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1161 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1294 = tensor.collapse_shape %expanded_1293 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1162 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1294, %1160 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1161 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1163 = tensor.empty() : tensor<4096x4096xf32> + %1164 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_206 : tensor<4096x4096xf32>) outs(%1163 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1295 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1296 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1165 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1295, %1164 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1296 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1297 = tensor.expand_shape %1165 [[0, 1], [2]] : 
tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1166 = tensor.empty() : tensor<4096x4096xf32> + %1167 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_208 : tensor<4096x4096xf32>) outs(%1166 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1298 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1299 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1168 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1298, %1167 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1299 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1300 = tensor.expand_shape %1168 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1169 = tensor.empty() : tensor<4096x4096xf32> + %1170 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_210 : tensor<4096x4096xf32>) outs(%1169 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1301 = tensor.collapse_shape %1162 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1302 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1171 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1301, %1170 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1302 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1303 = tensor.expand_shape %1171 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1304 = tensor.expand_shape %expanded_1297 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1172 = tensor.empty() : tensor<1x32x80x128xf32> + %1173 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1304 : tensor<1x80x32x128xf32>) outs(%1172 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1305 = tensor.expand_shape %expanded_1300 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1174 = tensor.empty() : tensor<1x32x80x128xf32> + %1175 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1305 : tensor<1x80x32x128xf32>) outs(%1174 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1306 = tensor.expand_shape %expanded_1303 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1176 = tensor.empty() : tensor<1x32x80x128xf32> + %1177 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1306 : tensor<1x80x32x128xf32>) outs(%1176 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1307 = tensor.extract_slice %expanded_556[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1308 = tensor.extract_slice %expanded_558[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1178 = tensor.empty() : tensor<1x80x128xf32> + %1179 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1307 : tensor<1x1x80x128xf32>) outs(%1178 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1180 = tensor.empty() : tensor<80x128xf32> + %1181 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1179 : tensor<1x80x128xf32>) outs(%1180 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> 
+ %1182 = tensor.empty() : tensor<1x80x128xf32> + %1183 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1308 : tensor<1x1x80x128xf32>) outs(%1182 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1184 = tensor.empty() : tensor<80x128xf32> + %1185 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1183 : tensor<1x80x128xf32>) outs(%1184 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1186 = tensor.empty() : tensor<1x80x128xf32> + %1187 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1186 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1181[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1309 = tensor.expand_shape %1187 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1188 = tensor.empty() : tensor<1x80x128xf32> + %1189 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1188 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1185[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1310 = tensor.expand_shape %1189 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1190 = tensor.empty() : tensor<1x32x80x128xf32> + %1191 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1173, %1187 
: tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1190 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1311 = tensor.extract_slice %1173[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1312 = tensor.extract_slice %1173[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1192 = tensor.empty() : tensor<1x32x80x64xf32> + %1193 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1312 : tensor<1x32x80x64xf32>) outs(%1192 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1194 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1313 = tensor.insert_slice %1193 into %1194[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1314 = tensor.insert_slice %extracted_slice_1311 into %inserted_slice_1313[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1195 = tensor.empty() : tensor<1x32x80x128xf32> + %1196 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1314, %1189 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1195 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1197 = tensor.empty() : tensor<1x32x80x128xf32> + %1198 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1191, %1196 : tensor<1x32x80x128xf32>, 
tensor<1x32x80x128xf32>) outs(%1197 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1199 = tensor.empty() : tensor<1x32x80x128xf32> + %1200 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1175, %1187 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1199 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1315 = tensor.extract_slice %1175[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1316 = tensor.extract_slice %1175[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1201 = tensor.empty() : tensor<1x32x80x64xf32> + %1202 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1316 : tensor<1x32x80x64xf32>) outs(%1201 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1203 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1317 = tensor.insert_slice %1202 into %1203[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1318 = tensor.insert_slice %extracted_slice_1315 into %inserted_slice_1317[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1204 = tensor.empty() : tensor<1x32x80x128xf32> + %1205 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1318, %1189 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1204 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1206 = tensor.empty() : tensor<1x32x80x128xf32> + %1207 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1200, %1205 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1206 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1208 = tensor.empty() : tensor<1x32x128x80xf32> + %1209 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1207 : tensor<1x32x80x128xf32>) outs(%1208 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1319 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1320 = tensor.collapse_shape %1198 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1321 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1322 = tensor.collapse_shape %1209 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1323 = arith.constant 0.000000e+00 : f32 + %1210 = tensor.empty() : tensor<32x80x80xf32> + %1211 = linalg.fill ins(%cst_1323 : f32) outs(%1210 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1212 = linalg.batch_matmul ins(%collapsed_1320, %collapsed_1322 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1211 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1324 = tensor.expand_shape %1212 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1325 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1213 = tensor.empty() : tensor<1x32x80x80xf32> + %1214 = linalg.generic {indexing_maps = [#map7, 
#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1325 : tensor<1x32x80x80xf32>) outs(%1213 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1215 = tensor.empty() : tensor<1x32x80x80xf32> + %1216 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1324, %1214 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1215 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1217 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1326 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1218 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1216, %collapsed_1326 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1217 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1219 = tensor.empty() : tensor<1x32x80x1xf32> + %1220 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1219 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1221 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1218 : tensor<1x32x80x80xf32>) outs(%1219 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1222 = 
tensor.empty() : tensor<1x32x80x80xf32> + %1223 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1218, %1221 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1222 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1224 = tensor.empty() : tensor<1x32x80x1xf32> + %1225 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1224 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1226 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1223 : tensor<1x32x80x80xf32>) outs(%1225 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1227 = tensor.empty() : tensor<1x32x80x80xf32> + %1228 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1223, %1226 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1227 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1327 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1328 = tensor.collapse_shape %1228 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1329 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1330 = tensor.collapse_shape %1177 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1331 = arith.constant 0.000000e+00 : f32 + %1229 = tensor.empty() : 
tensor<32x80x128xf32> + %1230 = linalg.fill ins(%cst_1331 : f32) outs(%1229 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1231 = linalg.batch_matmul ins(%collapsed_1328, %collapsed_1330 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1230 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1332 = tensor.expand_shape %1231 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1232 = tensor.empty() : tensor<1x80x32x128xf32> + %1233 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1332 : tensor<1x32x80x128xf32>) outs(%1232 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1333 = tensor.collapse_shape %1233 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1234 = tensor.empty() : tensor<4096x4096xf32> + %1235 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_212 : tensor<4096x4096xf32>) outs(%1234 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1334 = tensor.collapse_shape %collapsed_1333 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1335 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1236 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1334, %1235 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1335 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1336 = tensor.expand_shape %1236 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1237 = tensor.empty() : tensor<1x80x4096xf32> + %1238 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1151, %expanded_1336 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1237 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, 
%out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1239 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1337 = arith.constant 2.000000e+00 : f32 + %1240 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238 : tensor<1x80x4096xf32>) outs(%1239 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1337 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1338 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1241 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1240 : tensor<1x80x4096xf32>) outs(%cst_1338 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1339 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1242 = tensor.empty() : tensor<1x80x1xf32> + %1243 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1241, %cst_1339 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1242 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1244 = tensor.empty() : tensor<1x80x1xf32> + %1245 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1243 : tensor<1x80x1xf32>) outs(%1244 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1246 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1340 = tensor.collapse_shape %1245 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1247 = linalg.generic 
{indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238, %collapsed_1340 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1246 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1341 = tensor.expand_shape %extracted_slice_20 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1248 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1342 = tensor.collapse_shape %expanded_1341 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1249 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1342, %1247 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1248 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1250 = tensor.empty() : tensor<4096x11008xf32> + %1251 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_214 : tensor<11008x4096xf32>) outs(%1250 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1343 = tensor.collapse_shape %1249 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1344 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1252 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1343, %1251 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1344 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1345 = tensor.expand_shape %1252 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1253 = tensor.empty() : tensor<1x80x11008xf32> + %1254 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1345 : 
tensor<1x80x11008xf32>) outs(%1253 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1255 = tensor.empty() : tensor<4096x11008xf32> + %1256 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_216 : tensor<11008x4096xf32>) outs(%1255 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1346 = tensor.collapse_shape %1249 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1347 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1257 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1346, %1256 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1347 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1348 = tensor.expand_shape %1257 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1258 = tensor.empty() : tensor<1x80x11008xf32> + %1259 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1254, %expanded_1348 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1258 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1260 = tensor.empty() : tensor<11008x4096xf32> + %1261 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_218 : tensor<4096x11008xf32>) outs(%1260 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1349 = tensor.collapse_shape %1259 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + 
%cst_1350 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1262 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1349, %1261 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1350 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1351 = tensor.expand_shape %1262 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1263 = tensor.empty() : tensor<1x80x4096xf32> + %1264 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1238, %expanded_1351 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1263 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1265 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1352 = arith.constant 2.000000e+00 : f32 + %1266 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264 : tensor<1x80x4096xf32>) outs(%1265 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1352 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1353 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1267 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1266 : tensor<1x80x4096xf32>) outs(%cst_1353 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1354 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1268 = tensor.empty() : tensor<1x80x1xf32> + %1269 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1267, %cst_1354 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1268 : 
tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1270 = tensor.empty() : tensor<1x80x1xf32> + %1271 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1269 : tensor<1x80x1xf32>) outs(%1270 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1272 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1355 = tensor.collapse_shape %1271 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1273 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264, %collapsed_1355 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1272 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1356 = tensor.expand_shape %extracted_slice_21 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1274 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1357 = tensor.collapse_shape %expanded_1356 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1275 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1357, %1273 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1274 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1276 = tensor.empty() : tensor<4096x4096xf32> + %1277 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_220 : tensor<4096x4096xf32>) outs(%1276 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + 
%collapsed_1358 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1359 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1278 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1358, %1277 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1359 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1360 = tensor.expand_shape %1278 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1279 = tensor.empty() : tensor<4096x4096xf32> + %1280 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_222 : tensor<4096x4096xf32>) outs(%1279 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1361 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1362 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1281 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1361, %1280 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1362 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1363 = tensor.expand_shape %1281 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1282 = tensor.empty() : tensor<4096x4096xf32> + %1283 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_224 : tensor<4096x4096xf32>) outs(%1282 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1364 = tensor.collapse_shape %1275 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1365 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1284 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1364, %1283 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1365 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1366 = tensor.expand_shape %1284 [[0, 1], [2]] : 
tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1367 = tensor.expand_shape %expanded_1360 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1285 = tensor.empty() : tensor<1x32x80x128xf32> + %1286 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1367 : tensor<1x80x32x128xf32>) outs(%1285 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1368 = tensor.expand_shape %expanded_1363 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1287 = tensor.empty() : tensor<1x32x80x128xf32> + %1288 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1368 : tensor<1x80x32x128xf32>) outs(%1287 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1369 = tensor.expand_shape %expanded_1366 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1289 = tensor.empty() : tensor<1x32x80x128xf32> + %1290 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1369 : tensor<1x80x32x128xf32>) outs(%1289 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1370 = tensor.extract_slice %expanded_560[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1371 = tensor.extract_slice %expanded_562[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1291 = tensor.empty() : tensor<1x80x128xf32> + %1292 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1370 : tensor<1x1x80x128xf32>) 
outs(%1291 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1293 = tensor.empty() : tensor<80x128xf32> + %1294 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1292 : tensor<1x80x128xf32>) outs(%1293 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1295 = tensor.empty() : tensor<1x80x128xf32> + %1296 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1371 : tensor<1x1x80x128xf32>) outs(%1295 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1297 = tensor.empty() : tensor<80x128xf32> + %1298 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1296 : tensor<1x80x128xf32>) outs(%1297 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1299 = tensor.empty() : tensor<1x80x128xf32> + %1300 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1299 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1294[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1372 = tensor.expand_shape %1300 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1301 = tensor.empty() : tensor<1x80x128xf32> + %1302 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1301 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract 
%1298[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1373 = tensor.expand_shape %1302 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1303 = tensor.empty() : tensor<1x32x80x128xf32> + %1304 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1286, %1300 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1303 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1374 = tensor.extract_slice %1286[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1375 = tensor.extract_slice %1286[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1305 = tensor.empty() : tensor<1x32x80x64xf32> + %1306 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1375 : tensor<1x32x80x64xf32>) outs(%1305 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1307 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1376 = tensor.insert_slice %1306 into %1307[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1377 = tensor.insert_slice %extracted_slice_1374 into %inserted_slice_1376[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1308 = tensor.empty() : tensor<1x32x80x128xf32> + %1309 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1377, %1302 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1308 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1310 = tensor.empty() : tensor<1x32x80x128xf32> + %1311 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1304, %1309 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1310 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1312 = tensor.empty() : tensor<1x32x80x128xf32> + %1313 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1288, %1300 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1312 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1378 = tensor.extract_slice %1288[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1379 = tensor.extract_slice %1288[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1314 = tensor.empty() : tensor<1x32x80x64xf32> + %1315 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1379 : tensor<1x32x80x64xf32>) outs(%1314 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1316 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1380 = tensor.insert_slice %1315 into %1316[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1381 = tensor.insert_slice %extracted_slice_1378 into 
%inserted_slice_1380[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1317 = tensor.empty() : tensor<1x32x80x128xf32> + %1318 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1381, %1302 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1317 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1319 = tensor.empty() : tensor<1x32x80x128xf32> + %1320 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1313, %1318 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1319 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1321 = tensor.empty() : tensor<1x32x128x80xf32> + %1322 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1320 : tensor<1x32x80x128xf32>) outs(%1321 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1382 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1383 = tensor.collapse_shape %1311 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1384 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1385 = tensor.collapse_shape %1322 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1386 = arith.constant 0.000000e+00 : f32 + %1323 = tensor.empty() : tensor<32x80x80xf32> + %1324 = linalg.fill ins(%cst_1386 : f32) outs(%1323 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1325 = linalg.batch_matmul ins(%collapsed_1383, %collapsed_1385 : 
tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1324 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1387 = tensor.expand_shape %1325 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1388 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1326 = tensor.empty() : tensor<1x32x80x80xf32> + %1327 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1388 : tensor<1x32x80x80xf32>) outs(%1326 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1328 = tensor.empty() : tensor<1x32x80x80xf32> + %1329 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1387, %1327 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1328 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1330 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1389 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1331 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1329, %collapsed_1389 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1330 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1332 = tensor.empty() : tensor<1x32x80x1xf32> + %1333 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1332 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield 
%cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1334 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1331 : tensor<1x32x80x80xf32>) outs(%1332 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1335 = tensor.empty() : tensor<1x32x80x80xf32> + %1336 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1331, %1334 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1335 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1337 = tensor.empty() : tensor<1x32x80x1xf32> + %1338 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1337 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1339 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1336 : tensor<1x32x80x80xf32>) outs(%1338 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1340 = tensor.empty() : tensor<1x32x80x80xf32> + %1341 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1336, %1339 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1340 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1390 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + 
%collapsed_1391 = tensor.collapse_shape %1341 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1392 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1393 = tensor.collapse_shape %1290 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1394 = arith.constant 0.000000e+00 : f32 + %1342 = tensor.empty() : tensor<32x80x128xf32> + %1343 = linalg.fill ins(%cst_1394 : f32) outs(%1342 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1344 = linalg.batch_matmul ins(%collapsed_1391, %collapsed_1393 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1343 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1395 = tensor.expand_shape %1344 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1345 = tensor.empty() : tensor<1x80x32x128xf32> + %1346 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1395 : tensor<1x32x80x128xf32>) outs(%1345 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1396 = tensor.collapse_shape %1346 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1347 = tensor.empty() : tensor<4096x4096xf32> + %1348 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_226 : tensor<4096x4096xf32>) outs(%1347 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1397 = tensor.collapse_shape %collapsed_1396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1398 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1349 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1397, %1348 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1398 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1399 = tensor.expand_shape 
%1349 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1350 = tensor.empty() : tensor<1x80x4096xf32> + %1351 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1264, %expanded_1399 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1350 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1352 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1400 = arith.constant 2.000000e+00 : f32 + %1353 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351 : tensor<1x80x4096xf32>) outs(%1352 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1400 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1401 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1354 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1353 : tensor<1x80x4096xf32>) outs(%cst_1401 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1402 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1355 = tensor.empty() : tensor<1x80x1xf32> + %1356 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1354, %cst_1402 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1355 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1357 = tensor.empty() : tensor<1x80x1xf32> + %1358 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", 
"parallel"]} ins(%1356 : tensor<1x80x1xf32>) outs(%1357 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1359 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1403 = tensor.collapse_shape %1358 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1360 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351, %collapsed_1403 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1359 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1404 = tensor.expand_shape %extracted_slice_22 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1361 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1405 = tensor.collapse_shape %expanded_1404 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1362 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1405, %1360 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1361 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1363 = tensor.empty() : tensor<4096x11008xf32> + %1364 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_228 : tensor<11008x4096xf32>) outs(%1363 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1406 = tensor.collapse_shape %1362 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1407 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1365 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1406, %1364 : tensor<80x4096xf32>, tensor<4096x11008xf32>) 
outs(%cst_1407 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1408 = tensor.expand_shape %1365 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1366 = tensor.empty() : tensor<1x80x11008xf32> + %1367 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1408 : tensor<1x80x11008xf32>) outs(%1366 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1368 = tensor.empty() : tensor<4096x11008xf32> + %1369 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_230 : tensor<11008x4096xf32>) outs(%1368 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1409 = tensor.collapse_shape %1362 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1410 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1370 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1409, %1369 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1410 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1411 = tensor.expand_shape %1370 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1371 = tensor.empty() : tensor<1x80x11008xf32> + %1372 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1367, %expanded_1411 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1371 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1373 = tensor.empty() : tensor<11008x4096xf32> + %1374 = linalg.generic 
{indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_232 : tensor<4096x11008xf32>) outs(%1373 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1412 = tensor.collapse_shape %1372 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1413 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1375 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1412, %1374 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1413 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1414 = tensor.expand_shape %1375 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1376 = tensor.empty() : tensor<1x80x4096xf32> + %1377 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1351, %expanded_1414 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1376 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1378 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1415 = arith.constant 2.000000e+00 : f32 + %1379 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377 : tensor<1x80x4096xf32>) outs(%1378 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1415 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1416 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1380 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1379 : tensor<1x80x4096xf32>) outs(%cst_1416 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 
: f32 + } -> tensor<1x80x1xf32> + %cst_1417 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1381 = tensor.empty() : tensor<1x80x1xf32> + %1382 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1380, %cst_1417 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1381 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1383 = tensor.empty() : tensor<1x80x1xf32> + %1384 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1382 : tensor<1x80x1xf32>) outs(%1383 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1385 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1418 = tensor.collapse_shape %1384 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1386 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377, %collapsed_1418 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1385 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1419 = tensor.expand_shape %extracted_slice_23 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1387 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1420 = tensor.collapse_shape %expanded_1419 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1388 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1420, %1386 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1387 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> 
tensor<1x80x4096xf32> + %1389 = tensor.empty() : tensor<4096x4096xf32> + %1390 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_234 : tensor<4096x4096xf32>) outs(%1389 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1421 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1422 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1391 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1421, %1390 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1422 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1423 = tensor.expand_shape %1391 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1392 = tensor.empty() : tensor<4096x4096xf32> + %1393 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_236 : tensor<4096x4096xf32>) outs(%1392 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1424 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1425 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1394 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1424, %1393 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1425 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1426 = tensor.expand_shape %1394 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1395 = tensor.empty() : tensor<4096x4096xf32> + %1396 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_238 : tensor<4096x4096xf32>) outs(%1395 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1427 = tensor.collapse_shape %1388 [[0, 1], [2]] : tensor<1x80x4096xf32> into 
tensor<80x4096xf32> + %cst_1428 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1397 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1427, %1396 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1428 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1429 = tensor.expand_shape %1397 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1430 = tensor.expand_shape %expanded_1423 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1398 = tensor.empty() : tensor<1x32x80x128xf32> + %1399 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1430 : tensor<1x80x32x128xf32>) outs(%1398 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1431 = tensor.expand_shape %expanded_1426 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1400 = tensor.empty() : tensor<1x32x80x128xf32> + %1401 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1431 : tensor<1x80x32x128xf32>) outs(%1400 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1432 = tensor.expand_shape %expanded_1429 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1402 = tensor.empty() : tensor<1x32x80x128xf32> + %1403 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1432 : tensor<1x80x32x128xf32>) outs(%1402 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1433 = tensor.extract_slice %expanded_564[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1434 = 
tensor.extract_slice %expanded_566[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1404 = tensor.empty() : tensor<1x80x128xf32> + %1405 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1433 : tensor<1x1x80x128xf32>) outs(%1404 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1406 = tensor.empty() : tensor<80x128xf32> + %1407 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1405 : tensor<1x80x128xf32>) outs(%1406 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1408 = tensor.empty() : tensor<1x80x128xf32> + %1409 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1434 : tensor<1x1x80x128xf32>) outs(%1408 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1410 = tensor.empty() : tensor<80x128xf32> + %1411 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1409 : tensor<1x80x128xf32>) outs(%1410 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1412 = tensor.empty() : tensor<1x80x128xf32> + %1413 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1412 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1407[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1435 = tensor.expand_shape %1413 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1414 = tensor.empty() : 
tensor<1x80x128xf32> + %1415 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1414 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1411[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1436 = tensor.expand_shape %1415 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1416 = tensor.empty() : tensor<1x32x80x128xf32> + %1417 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1399, %1413 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1416 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1437 = tensor.extract_slice %1399[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1438 = tensor.extract_slice %1399[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1418 = tensor.empty() : tensor<1x32x80x64xf32> + %1419 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1438 : tensor<1x32x80x64xf32>) outs(%1418 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1420 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1439 = tensor.insert_slice %1419 into %1420[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1440 = tensor.insert_slice %extracted_slice_1437 into %inserted_slice_1439[0, 0, 0, 64] [1, 32, 80, 64] 
[1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1421 = tensor.empty() : tensor<1x32x80x128xf32> + %1422 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1440, %1415 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1421 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1423 = tensor.empty() : tensor<1x32x80x128xf32> + %1424 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1417, %1422 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1423 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1425 = tensor.empty() : tensor<1x32x80x128xf32> + %1426 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1401, %1413 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1425 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1441 = tensor.extract_slice %1401[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1442 = tensor.extract_slice %1401[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1427 = tensor.empty() : tensor<1x32x80x64xf32> + %1428 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1442 : tensor<1x32x80x64xf32>) outs(%1427 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : 
f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1429 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1443 = tensor.insert_slice %1428 into %1429[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1444 = tensor.insert_slice %extracted_slice_1441 into %inserted_slice_1443[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1430 = tensor.empty() : tensor<1x32x80x128xf32> + %1431 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1444, %1415 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1430 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1432 = tensor.empty() : tensor<1x32x80x128xf32> + %1433 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1426, %1431 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1432 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1434 = tensor.empty() : tensor<1x32x128x80xf32> + %1435 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1433 : tensor<1x32x80x128xf32>) outs(%1434 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1445 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1446 = tensor.collapse_shape %1424 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1447 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1448 = tensor.collapse_shape %1435 
[[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1449 = arith.constant 0.000000e+00 : f32 + %1436 = tensor.empty() : tensor<32x80x80xf32> + %1437 = linalg.fill ins(%cst_1449 : f32) outs(%1436 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1438 = linalg.batch_matmul ins(%collapsed_1446, %collapsed_1448 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1437 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1450 = tensor.expand_shape %1438 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1451 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1439 = tensor.empty() : tensor<1x32x80x80xf32> + %1440 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1451 : tensor<1x32x80x80xf32>) outs(%1439 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1441 = tensor.empty() : tensor<1x32x80x80xf32> + %1442 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1450, %1440 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1441 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1443 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1452 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1444 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1442, %collapsed_1452 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1443 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + 
linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1445 = tensor.empty() : tensor<1x32x80x1xf32> + %1446 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1445 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1447 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1444 : tensor<1x32x80x80xf32>) outs(%1446 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1448 = tensor.empty() : tensor<1x32x80x80xf32> + %1449 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1444, %1447 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1448 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1450 = tensor.empty() : tensor<1x32x80x1xf32> + %1451 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1450 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1452 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1449 : tensor<1x32x80x80xf32>) outs(%1451 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1453 = tensor.empty() : tensor<1x32x80x80xf32> + %1454 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel",
"parallel"]} ins(%1449, %1452 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1453 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1453 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1454 = tensor.collapse_shape %1454 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1455 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1456 = tensor.collapse_shape %1403 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1457 = arith.constant 0.000000e+00 : f32 + %1455 = tensor.empty() : tensor<32x80x128xf32> + %1456 = linalg.fill ins(%cst_1457 : f32) outs(%1455 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1457 = linalg.batch_matmul ins(%collapsed_1454, %collapsed_1456 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1456 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1458 = tensor.expand_shape %1457 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1458 = tensor.empty() : tensor<1x80x32x128xf32> + %1459 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1458 : tensor<1x32x80x128xf32>) outs(%1458 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1459 = tensor.collapse_shape %1459 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1460 = tensor.empty() : tensor<4096x4096xf32> + %1461 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_240 : tensor<4096x4096xf32>) outs(%1460 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1460 = tensor.collapse_shape %collapsed_1459 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1461 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1462 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1460, %1461 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1461 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1462 = tensor.expand_shape %1462 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1463 = tensor.empty() : tensor<1x80x4096xf32> + %1464 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1377, %expanded_1462 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1463 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1465 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1463 = arith.constant 2.000000e+00 : f32 + %1466 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464 : tensor<1x80x4096xf32>) outs(%1465 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1463 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1464 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1467 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1466 : tensor<1x80x4096xf32>) outs(%cst_1464 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1465 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1468 = tensor.empty() : tensor<1x80x1xf32> + %1469 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1467, %cst_1465 : 
tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1468 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1470 = tensor.empty() : tensor<1x80x1xf32> + %1471 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1469 : tensor<1x80x1xf32>) outs(%1470 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1472 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1466 = tensor.collapse_shape %1471 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1473 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464, %collapsed_1466 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1472 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1467 = tensor.expand_shape %extracted_slice_24 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1474 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1468 = tensor.collapse_shape %expanded_1467 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1475 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1468, %1473 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1474 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1476 = tensor.empty() : tensor<4096x11008xf32> + %1477 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_242 : tensor<11008x4096xf32>) outs(%1476 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1469 = tensor.collapse_shape %1475 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1470 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1478 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1469, %1477 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1470 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1471 = tensor.expand_shape %1478 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1479 = tensor.empty() : tensor<1x80x11008xf32> + %1480 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1471 : tensor<1x80x11008xf32>) outs(%1479 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1481 = tensor.empty() : tensor<4096x11008xf32> + %1482 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_244 : tensor<11008x4096xf32>) outs(%1481 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1472 = tensor.collapse_shape %1475 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1473 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1483 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1472, %1482 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1473 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1474 = tensor.expand_shape %1483 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1484 = tensor.empty() : tensor<1x80x11008xf32> + %1485 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", 
"parallel", "parallel"]} ins(%1480, %expanded_1474 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1484 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1486 = tensor.empty() : tensor<11008x4096xf32> + %1487 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_246 : tensor<4096x11008xf32>) outs(%1486 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1475 = tensor.collapse_shape %1485 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1476 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1488 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1475, %1487 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1476 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1477 = tensor.expand_shape %1488 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1489 = tensor.empty() : tensor<1x80x4096xf32> + %1490 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1464, %expanded_1477 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1489 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1491 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1478 = arith.constant 2.000000e+00 : f32 + %1492 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490 : tensor<1x80x4096xf32>) outs(%1491 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1478 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1479 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1493 = 
linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1492 : tensor<1x80x4096xf32>) outs(%cst_1479 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1480 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1494 = tensor.empty() : tensor<1x80x1xf32> + %1495 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1493, %cst_1480 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1494 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1496 = tensor.empty() : tensor<1x80x1xf32> + %1497 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1495 : tensor<1x80x1xf32>) outs(%1496 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1498 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1481 = tensor.collapse_shape %1497 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1499 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490, %collapsed_1481 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1498 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1482 = tensor.expand_shape %extracted_slice_25 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1500 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1483 = tensor.collapse_shape %expanded_1482 [[0, 1], [2]] : tensor<1x1x4096xf32> into 
tensor<1x4096xf32> + %1501 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1483, %1499 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1500 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1502 = tensor.empty() : tensor<4096x4096xf32> + %1503 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_248 : tensor<4096x4096xf32>) outs(%1502 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1484 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1485 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1504 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1484, %1503 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1485 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1486 = tensor.expand_shape %1504 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1505 = tensor.empty() : tensor<4096x4096xf32> + %1506 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_250 : tensor<4096x4096xf32>) outs(%1505 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1487 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1488 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1507 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1487, %1506 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1488 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1489 = tensor.expand_shape %1507 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1508 = tensor.empty() : 
tensor<4096x4096xf32> + %1509 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_252 : tensor<4096x4096xf32>) outs(%1508 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1490 = tensor.collapse_shape %1501 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1491 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1510 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1490, %1509 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1491 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1492 = tensor.expand_shape %1510 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1493 = tensor.expand_shape %expanded_1486 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1511 = tensor.empty() : tensor<1x32x80x128xf32> + %1512 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1493 : tensor<1x80x32x128xf32>) outs(%1511 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1494 = tensor.expand_shape %expanded_1489 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1513 = tensor.empty() : tensor<1x32x80x128xf32> + %1514 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1494 : tensor<1x80x32x128xf32>) outs(%1513 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1495 = tensor.expand_shape %expanded_1492 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1515 = tensor.empty() : tensor<1x32x80x128xf32> + %1516 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%expanded_1495 : tensor<1x80x32x128xf32>) outs(%1515 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1496 = tensor.extract_slice %expanded_568[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1497 = tensor.extract_slice %expanded_570[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1517 = tensor.empty() : tensor<1x80x128xf32> + %1518 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1496 : tensor<1x1x80x128xf32>) outs(%1517 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1519 = tensor.empty() : tensor<80x128xf32> + %1520 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1518 : tensor<1x80x128xf32>) outs(%1519 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1521 = tensor.empty() : tensor<1x80x128xf32> + %1522 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1497 : tensor<1x1x80x128xf32>) outs(%1521 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1523 = tensor.empty() : tensor<80x128xf32> + %1524 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1522 : tensor<1x80x128xf32>) outs(%1523 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1525 = tensor.empty() : tensor<1x80x128xf32> + %1526 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1525 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, 
%out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1520[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1498 = tensor.expand_shape %1526 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1527 = tensor.empty() : tensor<1x80x128xf32> + %1528 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1527 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1524[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1499 = tensor.expand_shape %1528 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1529 = tensor.empty() : tensor<1x32x80x128xf32> + %1530 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1512, %1526 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1529 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1500 = tensor.extract_slice %1512[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1501 = tensor.extract_slice %1512[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1531 = tensor.empty() : tensor<1x32x80x64xf32> + %1532 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1501 : tensor<1x32x80x64xf32>) outs(%1531 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : 
f32 + } -> tensor<1x32x80x64xf32> + %1533 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1502 = tensor.insert_slice %1532 into %1533[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1503 = tensor.insert_slice %extracted_slice_1500 into %inserted_slice_1502[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1534 = tensor.empty() : tensor<1x32x80x128xf32> + %1535 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1503, %1528 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1534 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1536 = tensor.empty() : tensor<1x32x80x128xf32> + %1537 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1530, %1535 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1536 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1538 = tensor.empty() : tensor<1x32x80x128xf32> + %1539 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1514, %1526 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1538 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1504 = tensor.extract_slice %1514[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1505 = tensor.extract_slice %1514[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : 
tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1540 = tensor.empty() : tensor<1x32x80x64xf32> + %1541 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1505 : tensor<1x32x80x64xf32>) outs(%1540 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1542 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1506 = tensor.insert_slice %1541 into %1542[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1507 = tensor.insert_slice %extracted_slice_1504 into %inserted_slice_1506[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1543 = tensor.empty() : tensor<1x32x80x128xf32> + %1544 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1507, %1528 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1543 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1545 = tensor.empty() : tensor<1x32x80x128xf32> + %1546 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1539, %1544 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1545 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1547 = tensor.empty() : tensor<1x32x128x80xf32> + %1548 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1546 : tensor<1x32x80x128xf32>) outs(%1547 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1508 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1509 = tensor.collapse_shape %1537 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1510 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1511 = tensor.collapse_shape %1548 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1512 = arith.constant 0.000000e+00 : f32 + %1549 = tensor.empty() : tensor<32x80x80xf32> + %1550 = linalg.fill ins(%cst_1512 : f32) outs(%1549 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1551 = linalg.batch_matmul ins(%collapsed_1509, %collapsed_1511 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1550 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1513 = tensor.expand_shape %1551 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1514 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1552 = tensor.empty() : tensor<1x32x80x80xf32> + %1553 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1514 : tensor<1x32x80x80xf32>) outs(%1552 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1554 = tensor.empty() : tensor<1x32x80x80xf32> + %1555 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1513, %1553 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1554 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1556 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1515 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : 
tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1557 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1555, %collapsed_1515 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1556 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1558 = tensor.empty() : tensor<1x32x80x1xf32> + %1559 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1558 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1560 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1557 : tensor<1x32x80x80xf32>) outs(%1558 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1561 = tensor.empty() : tensor<1x32x80x80xf32> + %1562 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1557, %1560 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1561 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1563 = tensor.empty() : tensor<1x32x80x1xf32> + %1564 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1563 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1565 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", 
"reduction"]} ins(%1562 : tensor<1x32x80x80xf32>) outs(%1564 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1566 = tensor.empty() : tensor<1x32x80x80xf32> + %1567 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1562, %1565 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1566 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1516 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1517 = tensor.collapse_shape %1567 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1518 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1519 = tensor.collapse_shape %1516 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1520 = arith.constant 0.000000e+00 : f32 + %1568 = tensor.empty() : tensor<32x80x128xf32> + %1569 = linalg.fill ins(%cst_1520 : f32) outs(%1568 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1570 = linalg.batch_matmul ins(%collapsed_1517, %collapsed_1519 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1569 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1521 = tensor.expand_shape %1570 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1571 = tensor.empty() : tensor<1x80x32x128xf32> + %1572 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1521 : tensor<1x32x80x128xf32>) outs(%1571 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1522 = tensor.collapse_shape %1572 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1573 = 
tensor.empty() : tensor<4096x4096xf32> + %1574 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_254 : tensor<4096x4096xf32>) outs(%1573 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1523 = tensor.collapse_shape %collapsed_1522 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1524 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1575 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1523, %1574 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1524 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1525 = tensor.expand_shape %1575 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1576 = tensor.empty() : tensor<1x80x4096xf32> + %1577 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1490, %expanded_1525 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1576 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1578 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1526 = arith.constant 2.000000e+00 : f32 + %1579 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577 : tensor<1x80x4096xf32>) outs(%1578 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1526 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1527 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1580 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1579 : tensor<1x80x4096xf32>) outs(%cst_1527 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 
: f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1528 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1581 = tensor.empty() : tensor<1x80x1xf32> + %1582 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1580, %cst_1528 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1581 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1583 = tensor.empty() : tensor<1x80x1xf32> + %1584 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1582 : tensor<1x80x1xf32>) outs(%1583 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1585 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1529 = tensor.collapse_shape %1584 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1586 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577, %collapsed_1529 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1585 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1530 = tensor.expand_shape %extracted_slice_26 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1587 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1531 = tensor.collapse_shape %expanded_1530 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1588 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1531, %1586 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1587 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = 
arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1589 = tensor.empty() : tensor<4096x11008xf32> + %1590 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_256 : tensor<11008x4096xf32>) outs(%1589 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1532 = tensor.collapse_shape %1588 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1533 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1591 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1532, %1590 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1533 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1534 = tensor.expand_shape %1591 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1592 = tensor.empty() : tensor<1x80x11008xf32> + %1593 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1534 : tensor<1x80x11008xf32>) outs(%1592 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1594 = tensor.empty() : tensor<4096x11008xf32> + %1595 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_258 : tensor<11008x4096xf32>) outs(%1594 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1535 = tensor.collapse_shape %1588 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1536 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1596 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1535, %1595 : 
tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1536 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1537 = tensor.expand_shape %1596 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1597 = tensor.empty() : tensor<1x80x11008xf32> + %1598 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1593, %expanded_1537 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1597 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1599 = tensor.empty() : tensor<11008x4096xf32> + %1600 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_260 : tensor<4096x11008xf32>) outs(%1599 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1538 = tensor.collapse_shape %1598 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1539 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1601 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1538, %1600 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1539 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1540 = tensor.expand_shape %1601 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1602 = tensor.empty() : tensor<1x80x4096xf32> + %1603 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1577, %expanded_1540 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1602 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1604 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1541 = arith.constant 2.000000e+00 : f32 + %1605 = linalg.generic {indexing_maps = 
[#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603 : tensor<1x80x4096xf32>) outs(%1604 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1541 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1542 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1606 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1605 : tensor<1x80x4096xf32>) outs(%cst_1542 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1543 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1607 = tensor.empty() : tensor<1x80x1xf32> + %1608 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1606, %cst_1543 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1607 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1609 = tensor.empty() : tensor<1x80x1xf32> + %1610 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1608 : tensor<1x80x1xf32>) outs(%1609 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1611 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1544 = tensor.collapse_shape %1610 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1612 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603, %collapsed_1544 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1611 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = 
arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1545 = tensor.expand_shape %extracted_slice_27 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1613 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1546 = tensor.collapse_shape %expanded_1545 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1614 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1546, %1612 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1613 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1615 = tensor.empty() : tensor<4096x4096xf32> + %1616 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_262 : tensor<4096x4096xf32>) outs(%1615 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1547 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1548 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1617 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1547, %1616 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1548 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1549 = tensor.expand_shape %1617 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1618 = tensor.empty() : tensor<4096x4096xf32> + %1619 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_264 : tensor<4096x4096xf32>) outs(%1618 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1550 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1551 = arith.constant dense<0.000000e+00> 
: tensor<80x4096xf32> + %1620 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1550, %1619 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1551 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1552 = tensor.expand_shape %1620 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1621 = tensor.empty() : tensor<4096x4096xf32> + %1622 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_266 : tensor<4096x4096xf32>) outs(%1621 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1553 = tensor.collapse_shape %1614 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1554 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1623 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1553, %1622 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1554 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1555 = tensor.expand_shape %1623 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1556 = tensor.expand_shape %expanded_1549 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1624 = tensor.empty() : tensor<1x32x80x128xf32> + %1625 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1556 : tensor<1x80x32x128xf32>) outs(%1624 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1557 = tensor.expand_shape %expanded_1552 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1626 = tensor.empty() : tensor<1x32x80x128xf32> + %1627 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1557 : tensor<1x80x32x128xf32>) outs(%1626 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1558 = tensor.expand_shape %expanded_1555 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1628 = tensor.empty() : tensor<1x32x80x128xf32> + %1629 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1558 : tensor<1x80x32x128xf32>) outs(%1628 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1559 = tensor.extract_slice %expanded_572[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1560 = tensor.extract_slice %expanded_574[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1630 = tensor.empty() : tensor<1x80x128xf32> + %1631 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1559 : tensor<1x1x80x128xf32>) outs(%1630 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1632 = tensor.empty() : tensor<80x128xf32> + %1633 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1631 : tensor<1x80x128xf32>) outs(%1632 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1634 = tensor.empty() : tensor<1x80x128xf32> + %1635 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1560 : tensor<1x1x80x128xf32>) outs(%1634 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1636 = tensor.empty() : tensor<80x128xf32> + %1637 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1635 : tensor<1x80x128xf32>) outs(%1636 : 
tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1638 = tensor.empty() : tensor<1x80x128xf32> + %1639 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1638 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1633[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1561 = tensor.expand_shape %1639 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1640 = tensor.empty() : tensor<1x80x128xf32> + %1641 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1640 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1637[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1562 = tensor.expand_shape %1641 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1642 = tensor.empty() : tensor<1x32x80x128xf32> + %1643 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1625, %1639 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1642 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1563 = tensor.extract_slice %1625[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1564 = tensor.extract_slice %1625[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + 
%1644 = tensor.empty() : tensor<1x32x80x64xf32> + %1645 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1564 : tensor<1x32x80x64xf32>) outs(%1644 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1646 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1565 = tensor.insert_slice %1645 into %1646[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1566 = tensor.insert_slice %extracted_slice_1563 into %inserted_slice_1565[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1647 = tensor.empty() : tensor<1x32x80x128xf32> + %1648 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1566, %1641 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1647 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1649 = tensor.empty() : tensor<1x32x80x128xf32> + %1650 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1643, %1648 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1649 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1651 = tensor.empty() : tensor<1x32x80x128xf32> + %1652 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1627, %1639 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1651 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1567 = tensor.extract_slice %1627[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1568 = tensor.extract_slice %1627[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1653 = tensor.empty() : tensor<1x32x80x64xf32> + %1654 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1568 : tensor<1x32x80x64xf32>) outs(%1653 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1655 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1569 = tensor.insert_slice %1654 into %1655[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1570 = tensor.insert_slice %extracted_slice_1567 into %inserted_slice_1569[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1656 = tensor.empty() : tensor<1x32x80x128xf32> + %1657 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1570, %1641 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1656 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1658 = tensor.empty() : tensor<1x32x80x128xf32> + %1659 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1652, %1657 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1658 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + 
linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1660 = tensor.empty() : tensor<1x32x128x80xf32> + %1661 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1659 : tensor<1x32x80x128xf32>) outs(%1660 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1571 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1572 = tensor.collapse_shape %1650 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1573 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1574 = tensor.collapse_shape %1661 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1575 = arith.constant 0.000000e+00 : f32 + %1662 = tensor.empty() : tensor<32x80x80xf32> + %1663 = linalg.fill ins(%cst_1575 : f32) outs(%1662 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1664 = linalg.batch_matmul ins(%collapsed_1572, %collapsed_1574 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1663 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1576 = tensor.expand_shape %1664 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1577 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1665 = tensor.empty() : tensor<1x32x80x80xf32> + %1666 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1577 : tensor<1x32x80x80xf32>) outs(%1665 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1667 = tensor.empty() : tensor<1x32x80x80xf32> + %1668 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1576, %1666 : 
tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1667 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1669 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1578 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1670 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1668, %collapsed_1578 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1669 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1671 = tensor.empty() : tensor<1x32x80x1xf32> + %1672 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1671 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1673 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1670 : tensor<1x32x80x80xf32>) outs(%1671 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1674 = tensor.empty() : tensor<1x32x80x80xf32> + %1675 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1670, %1673 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1674 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1676 = tensor.empty() : tensor<1x32x80x1xf32> + %1677 = linalg.generic {indexing_maps = [#map7], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1676 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1678 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1675 : tensor<1x32x80x80xf32>) outs(%1677 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1679 = tensor.empty() : tensor<1x32x80x80xf32> + %1680 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1675, %1678 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1679 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1579 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1580 = tensor.collapse_shape %1680 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1581 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1582 = tensor.collapse_shape %1629 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1583 = arith.constant 0.000000e+00 : f32 + %1681 = tensor.empty() : tensor<32x80x128xf32> + %1682 = linalg.fill ins(%cst_1583 : f32) outs(%1681 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1683 = linalg.batch_matmul ins(%collapsed_1580, %collapsed_1582 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1682 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1584 = tensor.expand_shape %1683 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1684 = tensor.empty() : tensor<1x80x32x128xf32> + %1685 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1584 : tensor<1x32x80x128xf32>) outs(%1684 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1585 = tensor.collapse_shape %1685 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1686 = tensor.empty() : tensor<4096x4096xf32> + %1687 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_268 : tensor<4096x4096xf32>) outs(%1686 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1586 = tensor.collapse_shape %collapsed_1585 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1587 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1688 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1586, %1687 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1587 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1588 = tensor.expand_shape %1688 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1689 = tensor.empty() : tensor<1x80x4096xf32> + %1690 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1603, %expanded_1588 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1689 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1691 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1589 = arith.constant 2.000000e+00 : f32 + %1692 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690 : tensor<1x80x4096xf32>) outs(%1691 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1589 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1590 = arith.constant 
dense<0.000000e+00> : tensor<1x80x1xf32> + %1693 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1692 : tensor<1x80x4096xf32>) outs(%cst_1590 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1591 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1694 = tensor.empty() : tensor<1x80x1xf32> + %1695 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1693, %cst_1591 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1694 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1696 = tensor.empty() : tensor<1x80x1xf32> + %1697 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1695 : tensor<1x80x1xf32>) outs(%1696 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1698 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1592 = tensor.collapse_shape %1697 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1699 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690, %collapsed_1592 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1698 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1593 = tensor.expand_shape %extracted_slice_28 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1700 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1594 = tensor.collapse_shape 
%expanded_1593 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1701 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1594, %1699 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1700 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1702 = tensor.empty() : tensor<4096x11008xf32> + %1703 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_270 : tensor<11008x4096xf32>) outs(%1702 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1595 = tensor.collapse_shape %1701 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1596 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1704 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1595, %1703 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1596 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1597 = tensor.expand_shape %1704 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1705 = tensor.empty() : tensor<1x80x11008xf32> + %1706 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1597 : tensor<1x80x11008xf32>) outs(%1705 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1707 = tensor.empty() : tensor<4096x11008xf32> + %1708 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_272 : tensor<11008x4096xf32>) outs(%1707 : tensor<4096x11008xf32>) 
{ + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1598 = tensor.collapse_shape %1701 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1599 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1709 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1598, %1708 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1599 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1600 = tensor.expand_shape %1709 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1710 = tensor.empty() : tensor<1x80x11008xf32> + %1711 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1706, %expanded_1600 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1710 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1712 = tensor.empty() : tensor<11008x4096xf32> + %1713 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_274 : tensor<4096x11008xf32>) outs(%1712 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1601 = tensor.collapse_shape %1711 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1602 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1714 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1601, %1713 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1602 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1603 = tensor.expand_shape %1714 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1715 = tensor.empty() : tensor<1x80x4096xf32> + %1716 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1690, %expanded_1603 : 
tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1715 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1717 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1604 = arith.constant 2.000000e+00 : f32 + %1718 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716 : tensor<1x80x4096xf32>) outs(%1717 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1604 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1605 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1719 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1718 : tensor<1x80x4096xf32>) outs(%cst_1605 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1606 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1720 = tensor.empty() : tensor<1x80x1xf32> + %1721 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1719, %cst_1606 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1720 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1722 = tensor.empty() : tensor<1x80x1xf32> + %1723 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1721 : tensor<1x80x1xf32>) outs(%1722 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1724 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1607 = 
tensor.collapse_shape %1723 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1725 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716, %collapsed_1607 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1724 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1608 = tensor.expand_shape %extracted_slice_29 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1726 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1609 = tensor.collapse_shape %expanded_1608 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1727 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1609, %1725 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1726 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1728 = tensor.empty() : tensor<4096x4096xf32> + %1729 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_276 : tensor<4096x4096xf32>) outs(%1728 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1610 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1611 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1730 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1610, %1729 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1611 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1612 = tensor.expand_shape %1730 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1731 = tensor.empty() : tensor<4096x4096xf32> + %1732 = linalg.generic {indexing_maps = [#map, #map9], 
iterator_types = ["parallel", "parallel"]} ins(%expanded_278 : tensor<4096x4096xf32>) outs(%1731 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1613 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1614 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1733 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1613, %1732 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1614 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1615 = tensor.expand_shape %1733 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1734 = tensor.empty() : tensor<4096x4096xf32> + %1735 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_280 : tensor<4096x4096xf32>) outs(%1734 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1616 = tensor.collapse_shape %1727 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1617 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1736 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1616, %1735 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1617 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1618 = tensor.expand_shape %1736 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1619 = tensor.expand_shape %expanded_1612 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1737 = tensor.empty() : tensor<1x32x80x128xf32> + %1738 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1619 : tensor<1x80x32x128xf32>) outs(%1737 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1620 = tensor.expand_shape %expanded_1615 [[0], 
[1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1739 = tensor.empty() : tensor<1x32x80x128xf32> + %1740 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1620 : tensor<1x80x32x128xf32>) outs(%1739 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1621 = tensor.expand_shape %expanded_1618 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1741 = tensor.empty() : tensor<1x32x80x128xf32> + %1742 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1621 : tensor<1x80x32x128xf32>) outs(%1741 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1622 = tensor.extract_slice %expanded_576[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1623 = tensor.extract_slice %expanded_578[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1743 = tensor.empty() : tensor<1x80x128xf32> + %1744 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1622 : tensor<1x1x80x128xf32>) outs(%1743 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1745 = tensor.empty() : tensor<80x128xf32> + %1746 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1744 : tensor<1x80x128xf32>) outs(%1745 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1747 = tensor.empty() : tensor<1x80x128xf32> + %1748 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%extracted_slice_1623 : tensor<1x1x80x128xf32>) outs(%1747 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1749 = tensor.empty() : tensor<80x128xf32> + %1750 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1748 : tensor<1x80x128xf32>) outs(%1749 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1751 = tensor.empty() : tensor<1x80x128xf32> + %1752 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1751 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1746[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1624 = tensor.expand_shape %1752 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1753 = tensor.empty() : tensor<1x80x128xf32> + %1754 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1753 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1750[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1625 = tensor.expand_shape %1754 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1755 = tensor.empty() : tensor<1x32x80x128xf32> + %1756 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1738, %1752 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1755 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, 
%in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1626 = tensor.extract_slice %1738[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1627 = tensor.extract_slice %1738[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1757 = tensor.empty() : tensor<1x32x80x64xf32> + %1758 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1627 : tensor<1x32x80x64xf32>) outs(%1757 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1759 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1628 = tensor.insert_slice %1758 into %1759[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1629 = tensor.insert_slice %extracted_slice_1626 into %inserted_slice_1628[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1760 = tensor.empty() : tensor<1x32x80x128xf32> + %1761 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1629, %1754 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1760 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1762 = tensor.empty() : tensor<1x32x80x128xf32> + %1763 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1756, %1761 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1762 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + 
} -> tensor<1x32x80x128xf32> + %1764 = tensor.empty() : tensor<1x32x80x128xf32> + %1765 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1740, %1752 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1764 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1630 = tensor.extract_slice %1740[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1631 = tensor.extract_slice %1740[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1766 = tensor.empty() : tensor<1x32x80x64xf32> + %1767 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1631 : tensor<1x32x80x64xf32>) outs(%1766 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1768 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1632 = tensor.insert_slice %1767 into %1768[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1633 = tensor.insert_slice %extracted_slice_1630 into %inserted_slice_1632[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1769 = tensor.empty() : tensor<1x32x80x128xf32> + %1770 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1633, %1754 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1769 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1771 = 
tensor.empty() : tensor<1x32x80x128xf32> + %1772 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1765, %1770 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1771 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1773 = tensor.empty() : tensor<1x32x128x80xf32> + %1774 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1772 : tensor<1x32x80x128xf32>) outs(%1773 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1634 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1635 = tensor.collapse_shape %1763 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1636 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1637 = tensor.collapse_shape %1774 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1638 = arith.constant 0.000000e+00 : f32 + %1775 = tensor.empty() : tensor<32x80x80xf32> + %1776 = linalg.fill ins(%cst_1638 : f32) outs(%1775 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1777 = linalg.batch_matmul ins(%collapsed_1635, %collapsed_1637 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1776 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1639 = tensor.expand_shape %1777 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1640 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1778 = tensor.empty() : tensor<1x32x80x80xf32> + %1779 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1640 : tensor<1x32x80x80xf32>) outs(%1778 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + 
%cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1780 = tensor.empty() : tensor<1x32x80x80xf32> + %1781 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1639, %1779 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1780 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1782 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1641 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1783 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1781, %collapsed_1641 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1782 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1784 = tensor.empty() : tensor<1x32x80x1xf32> + %1785 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1784 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1786 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1783 : tensor<1x32x80x80xf32>) outs(%1784 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1787 = tensor.empty() : tensor<1x32x80x80xf32> + %1788 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1783, %1786 : 
tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1787 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1789 = tensor.empty() : tensor<1x32x80x1xf32> + %1790 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1789 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1791 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1788 : tensor<1x32x80x80xf32>) outs(%1790 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1792 = tensor.empty() : tensor<1x32x80x80xf32> + %1793 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1788, %1791 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1792 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1642 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1643 = tensor.collapse_shape %1793 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1644 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1645 = tensor.collapse_shape %1742 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1646 = arith.constant 0.000000e+00 : f32 + %1794 = tensor.empty() : tensor<32x80x128xf32> + %1795 = linalg.fill ins(%cst_1646 : f32) outs(%1794 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1796 = linalg.batch_matmul ins(%collapsed_1643, %collapsed_1645 
: tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1795 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1647 = tensor.expand_shape %1796 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1797 = tensor.empty() : tensor<1x80x32x128xf32> + %1798 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1647 : tensor<1x32x80x128xf32>) outs(%1797 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1648 = tensor.collapse_shape %1798 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1799 = tensor.empty() : tensor<4096x4096xf32> + %1800 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_282 : tensor<4096x4096xf32>) outs(%1799 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1649 = tensor.collapse_shape %collapsed_1648 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1650 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1801 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1649, %1800 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1650 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1651 = tensor.expand_shape %1801 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1802 = tensor.empty() : tensor<1x80x4096xf32> + %1803 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1716, %expanded_1651 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1802 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1804 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1652 = arith.constant 
2.000000e+00 : f32 + %1805 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803 : tensor<1x80x4096xf32>) outs(%1804 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1652 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1653 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1806 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1805 : tensor<1x80x4096xf32>) outs(%cst_1653 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1654 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1807 = tensor.empty() : tensor<1x80x1xf32> + %1808 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1806, %cst_1654 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1807 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1809 = tensor.empty() : tensor<1x80x1xf32> + %1810 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1808 : tensor<1x80x1xf32>) outs(%1809 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1811 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1655 = tensor.collapse_shape %1810 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1812 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803, %collapsed_1655 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1811 : 
tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1656 = tensor.expand_shape %extracted_slice_30 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1813 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1657 = tensor.collapse_shape %expanded_1656 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1814 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1657, %1812 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1813 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1815 = tensor.empty() : tensor<4096x11008xf32> + %1816 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_284 : tensor<11008x4096xf32>) outs(%1815 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1658 = tensor.collapse_shape %1814 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1659 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1817 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1658, %1816 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1659 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1660 = tensor.expand_shape %1817 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1818 = tensor.empty() : tensor<1x80x11008xf32> + %1819 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1660 : tensor<1x80x11008xf32>) outs(%1818 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : 
f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1820 = tensor.empty() : tensor<4096x11008xf32> + %1821 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_286 : tensor<11008x4096xf32>) outs(%1820 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1661 = tensor.collapse_shape %1814 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1662 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1822 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1661, %1821 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1662 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1663 = tensor.expand_shape %1822 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1823 = tensor.empty() : tensor<1x80x11008xf32> + %1824 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1819, %expanded_1663 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1823 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1825 = tensor.empty() : tensor<11008x4096xf32> + %1826 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_288 : tensor<4096x11008xf32>) outs(%1825 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1664 = tensor.collapse_shape %1824 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1665 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1827 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1664, %1826 : tensor<80x11008xf32>, tensor<11008x4096xf32>) 
outs(%cst_1665 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1666 = tensor.expand_shape %1827 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1828 = tensor.empty() : tensor<1x80x4096xf32> + %1829 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1803, %expanded_1666 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1828 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1830 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1667 = arith.constant 2.000000e+00 : f32 + %1831 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829 : tensor<1x80x4096xf32>) outs(%1830 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1667 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1668 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1832 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1831 : tensor<1x80x4096xf32>) outs(%cst_1668 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1669 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1833 = tensor.empty() : tensor<1x80x1xf32> + %1834 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1832, %cst_1669 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1833 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1835 = tensor.empty() : tensor<1x80x1xf32> 
+ %1836 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1834 : tensor<1x80x1xf32>) outs(%1835 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1837 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1670 = tensor.collapse_shape %1836 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1838 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829, %collapsed_1670 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1837 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1671 = tensor.expand_shape %extracted_slice_31 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1839 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1672 = tensor.collapse_shape %expanded_1671 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1840 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1672, %1838 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1839 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1841 = tensor.empty() : tensor<4096x4096xf32> + %1842 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_290 : tensor<4096x4096xf32>) outs(%1841 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1673 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1674 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1843 = linalg.matmul {cast = 
#linalg.type_fn} ins(%collapsed_1673, %1842 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1674 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1675 = tensor.expand_shape %1843 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1844 = tensor.empty() : tensor<4096x4096xf32> + %1845 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_292 : tensor<4096x4096xf32>) outs(%1844 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1676 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1677 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1846 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1676, %1845 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1677 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1678 = tensor.expand_shape %1846 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1847 = tensor.empty() : tensor<4096x4096xf32> + %1848 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_294 : tensor<4096x4096xf32>) outs(%1847 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1679 = tensor.collapse_shape %1840 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1680 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1849 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1679, %1848 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1680 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1681 = tensor.expand_shape %1849 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1682 = tensor.expand_shape %expanded_1675 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1850 = tensor.empty() : 
tensor<1x32x80x128xf32> + %1851 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1682 : tensor<1x80x32x128xf32>) outs(%1850 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1683 = tensor.expand_shape %expanded_1678 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1852 = tensor.empty() : tensor<1x32x80x128xf32> + %1853 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1683 : tensor<1x80x32x128xf32>) outs(%1852 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1684 = tensor.expand_shape %expanded_1681 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1854 = tensor.empty() : tensor<1x32x80x128xf32> + %1855 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1684 : tensor<1x80x32x128xf32>) outs(%1854 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1685 = tensor.extract_slice %expanded_580[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1686 = tensor.extract_slice %expanded_582[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1856 = tensor.empty() : tensor<1x80x128xf32> + %1857 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1685 : tensor<1x1x80x128xf32>) outs(%1856 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1858 = tensor.empty() : tensor<80x128xf32> + %1859 = linalg.generic {indexing_maps = 
[#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1857 : tensor<1x80x128xf32>) outs(%1858 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1860 = tensor.empty() : tensor<1x80x128xf32> + %1861 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1686 : tensor<1x1x80x128xf32>) outs(%1860 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1862 = tensor.empty() : tensor<80x128xf32> + %1863 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1861 : tensor<1x80x128xf32>) outs(%1862 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1864 = tensor.empty() : tensor<1x80x128xf32> + %1865 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1864 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1859[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1687 = tensor.expand_shape %1865 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1866 = tensor.empty() : tensor<1x80x128xf32> + %1867 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1866 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1863[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1688 = tensor.expand_shape %1867 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into 
tensor<1x1x80x128xf32> + %1868 = tensor.empty() : tensor<1x32x80x128xf32> + %1869 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1851, %1865 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1868 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1689 = tensor.extract_slice %1851[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1690 = tensor.extract_slice %1851[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1870 = tensor.empty() : tensor<1x32x80x64xf32> + %1871 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1690 : tensor<1x32x80x64xf32>) outs(%1870 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1872 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1691 = tensor.insert_slice %1871 into %1872[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1692 = tensor.insert_slice %extracted_slice_1689 into %inserted_slice_1691[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1873 = tensor.empty() : tensor<1x32x80x128xf32> + %1874 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1692, %1867 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1873 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1875 = 
tensor.empty() : tensor<1x32x80x128xf32> + %1876 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1869, %1874 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1875 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1877 = tensor.empty() : tensor<1x32x80x128xf32> + %1878 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1853, %1865 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1877 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1693 = tensor.extract_slice %1853[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1694 = tensor.extract_slice %1853[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1879 = tensor.empty() : tensor<1x32x80x64xf32> + %1880 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1694 : tensor<1x32x80x64xf32>) outs(%1879 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1881 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1695 = tensor.insert_slice %1880 into %1881[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1696 = tensor.insert_slice %extracted_slice_1693 into %inserted_slice_1695[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1882 = tensor.empty() : tensor<1x32x80x128xf32> + %1883 = 
linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1696, %1867 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1882 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1884 = tensor.empty() : tensor<1x32x80x128xf32> + %1885 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1878, %1883 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1884 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1886 = tensor.empty() : tensor<1x32x128x80xf32> + %1887 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1885 : tensor<1x32x80x128xf32>) outs(%1886 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1697 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1698 = tensor.collapse_shape %1876 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1699 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1700 = tensor.collapse_shape %1887 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1701 = arith.constant 0.000000e+00 : f32 + %1888 = tensor.empty() : tensor<32x80x80xf32> + %1889 = linalg.fill ins(%cst_1701 : f32) outs(%1888 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %1890 = linalg.batch_matmul ins(%collapsed_1698, %collapsed_1700 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%1889 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1702 = tensor.expand_shape %1890 [[0, 1], [2], [3]] : 
tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1703 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %1891 = tensor.empty() : tensor<1x32x80x80xf32> + %1892 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1703 : tensor<1x32x80x80xf32>) outs(%1891 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1893 = tensor.empty() : tensor<1x32x80x80xf32> + %1894 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1702, %1892 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%1893 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1895 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1704 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %1896 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1894, %collapsed_1704 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%1895 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %1897 = tensor.empty() : tensor<1x32x80x1xf32> + %1898 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1897 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1899 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} 
ins(%1896 : tensor<1x32x80x80xf32>) outs(%1897 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1900 = tensor.empty() : tensor<1x32x80x80xf32> + %1901 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1896, %1899 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1900 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %1902 = tensor.empty() : tensor<1x32x80x1xf32> + %1903 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1902 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %1904 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1901 : tensor<1x32x80x80xf32>) outs(%1903 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %1905 = tensor.empty() : tensor<1x32x80x80xf32> + %1906 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1901, %1904 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%1905 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1705 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1706 = tensor.collapse_shape %1906 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1707 = arith.constant dense<0.000000e+00> : 
tensor<1x32x80x128xf32> + %collapsed_1708 = tensor.collapse_shape %1855 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1709 = arith.constant 0.000000e+00 : f32 + %1907 = tensor.empty() : tensor<32x80x128xf32> + %1908 = linalg.fill ins(%cst_1709 : f32) outs(%1907 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %1909 = linalg.batch_matmul ins(%collapsed_1706, %collapsed_1708 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%1908 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1710 = tensor.expand_shape %1909 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %1910 = tensor.empty() : tensor<1x80x32x128xf32> + %1911 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1710 : tensor<1x32x80x128xf32>) outs(%1910 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1711 = tensor.collapse_shape %1911 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %1912 = tensor.empty() : tensor<4096x4096xf32> + %1913 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_296 : tensor<4096x4096xf32>) outs(%1912 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1712 = tensor.collapse_shape %collapsed_1711 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1713 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1914 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1712, %1913 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1713 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1714 = tensor.expand_shape %1914 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1915 = tensor.empty() : tensor<1x80x4096xf32> + %1916 = linalg.generic {indexing_maps = [#map2, 
#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1829, %expanded_1714 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1915 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1917 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1715 = arith.constant 2.000000e+00 : f32 + %1918 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916 : tensor<1x80x4096xf32>) outs(%1917 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1715 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1716 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1919 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1918 : tensor<1x80x4096xf32>) outs(%cst_1716 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1717 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1920 = tensor.empty() : tensor<1x80x1xf32> + %1921 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1919, %cst_1717 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1920 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1922 = tensor.empty() : tensor<1x80x1xf32> + %1923 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1921 : tensor<1x80x1xf32>) outs(%1922 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 
+ } -> tensor<1x80x1xf32> + %1924 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1718 = tensor.collapse_shape %1923 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1925 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916, %collapsed_1718 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1924 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1719 = tensor.expand_shape %extracted_slice_32 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1926 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1720 = tensor.collapse_shape %expanded_1719 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1927 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1720, %1925 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1926 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1928 = tensor.empty() : tensor<4096x11008xf32> + %1929 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_298 : tensor<11008x4096xf32>) outs(%1928 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1721 = tensor.collapse_shape %1927 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1722 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1930 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1721, %1929 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1722 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1723 = tensor.expand_shape %1930 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> 
+ %1931 = tensor.empty() : tensor<1x80x11008xf32> + %1932 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1723 : tensor<1x80x11008xf32>) outs(%1931 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %1933 = tensor.empty() : tensor<4096x11008xf32> + %1934 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_300 : tensor<11008x4096xf32>) outs(%1933 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1724 = tensor.collapse_shape %1927 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1725 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %1935 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1724, %1934 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1725 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1726 = tensor.expand_shape %1935 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %1936 = tensor.empty() : tensor<1x80x11008xf32> + %1937 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1932, %expanded_1726 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%1936 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %1938 = tensor.empty() : tensor<11008x4096xf32> + %1939 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_302 : tensor<4096x11008xf32>) outs(%1938 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, 
%out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1727 = tensor.collapse_shape %1937 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1728 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1940 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1727, %1939 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1728 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1729 = tensor.expand_shape %1940 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1941 = tensor.empty() : tensor<1x80x4096xf32> + %1942 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1916, %expanded_1729 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%1941 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1943 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1730 = arith.constant 2.000000e+00 : f32 + %1944 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942 : tensor<1x80x4096xf32>) outs(%1943 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1730 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1731 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %1945 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%1944 : tensor<1x80x4096xf32>) outs(%cst_1731 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1732 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %1946 = tensor.empty() : tensor<1x80x1xf32> + %1947 = linalg.generic 
{indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1945, %cst_1732 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%1946 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1948 = tensor.empty() : tensor<1x80x1xf32> + %1949 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1947 : tensor<1x80x1xf32>) outs(%1948 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %1950 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1733 = tensor.collapse_shape %1949 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %1951 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942, %collapsed_1733 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%1950 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1734 = tensor.expand_shape %extracted_slice_33 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %1952 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1735 = tensor.collapse_shape %expanded_1734 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %1953 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1735, %1951 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%1952 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %1954 = tensor.empty() : tensor<4096x4096xf32> + %1955 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} 
ins(%expanded_304 : tensor<4096x4096xf32>) outs(%1954 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1736 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1737 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1956 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1736, %1955 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1737 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1738 = tensor.expand_shape %1956 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1957 = tensor.empty() : tensor<4096x4096xf32> + %1958 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_306 : tensor<4096x4096xf32>) outs(%1957 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1739 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1740 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1959 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1739, %1958 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1740 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1741 = tensor.expand_shape %1959 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %1960 = tensor.empty() : tensor<4096x4096xf32> + %1961 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_308 : tensor<4096x4096xf32>) outs(%1960 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1742 = tensor.collapse_shape %1953 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1743 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %1962 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1742, %1961 : 
tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1743 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1744 = tensor.expand_shape %1962 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1745 = tensor.expand_shape %expanded_1738 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1963 = tensor.empty() : tensor<1x32x80x128xf32> + %1964 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1745 : tensor<1x80x32x128xf32>) outs(%1963 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1746 = tensor.expand_shape %expanded_1741 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1965 = tensor.empty() : tensor<1x32x80x128xf32> + %1966 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1746 : tensor<1x80x32x128xf32>) outs(%1965 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1747 = tensor.expand_shape %expanded_1744 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %1967 = tensor.empty() : tensor<1x32x80x128xf32> + %1968 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1747 : tensor<1x80x32x128xf32>) outs(%1967 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1748 = tensor.extract_slice %expanded_584[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1749 = tensor.extract_slice %expanded_586[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %1969 = tensor.empty() : tensor<1x80x128xf32> + 
%1970 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1748 : tensor<1x1x80x128xf32>) outs(%1969 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1971 = tensor.empty() : tensor<80x128xf32> + %1972 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1970 : tensor<1x80x128xf32>) outs(%1971 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1973 = tensor.empty() : tensor<1x80x128xf32> + %1974 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1749 : tensor<1x1x80x128xf32>) outs(%1973 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %1975 = tensor.empty() : tensor<80x128xf32> + %1976 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%1974 : tensor<1x80x128xf32>) outs(%1975 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %1977 = tensor.empty() : tensor<1x80x128xf32> + %1978 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1977 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1972[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1750 = tensor.expand_shape %1978 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1979 = tensor.empty() : tensor<1x80x128xf32> + %1980 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%1979 : 
tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %1976[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1751 = tensor.expand_shape %1980 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %1981 = tensor.empty() : tensor<1x32x80x128xf32> + %1982 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1964, %1978 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1981 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1752 = tensor.extract_slice %1964[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1753 = tensor.extract_slice %1964[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1983 = tensor.empty() : tensor<1x32x80x64xf32> + %1984 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1753 : tensor<1x32x80x64xf32>) outs(%1983 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1985 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1754 = tensor.insert_slice %1984 into %1985[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1755 = tensor.insert_slice %extracted_slice_1752 into %inserted_slice_1754[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1986 = tensor.empty() : tensor<1x32x80x128xf32> + %1987 = linalg.generic {indexing_maps = [#map7, #map13, 
#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1755, %1980 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1986 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1988 = tensor.empty() : tensor<1x32x80x128xf32> + %1989 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1982, %1987 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1988 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1990 = tensor.empty() : tensor<1x32x80x128xf32> + %1991 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1966, %1978 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1990 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1756 = tensor.extract_slice %1966[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1757 = tensor.extract_slice %1966[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %1992 = tensor.empty() : tensor<1x32x80x64xf32> + %1993 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1757 : tensor<1x32x80x64xf32>) outs(%1992 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %1994 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1758 = tensor.insert_slice %1993 into %1994[0, 
0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1759 = tensor.insert_slice %extracted_slice_1756 into %inserted_slice_1758[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %1995 = tensor.empty() : tensor<1x32x80x128xf32> + %1996 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1759, %1980 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%1995 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1997 = tensor.empty() : tensor<1x32x80x128xf32> + %1998 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1991, %1996 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%1997 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %1999 = tensor.empty() : tensor<1x32x128x80xf32> + %2000 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1998 : tensor<1x32x80x128xf32>) outs(%1999 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1760 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1761 = tensor.collapse_shape %1989 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1762 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1763 = tensor.collapse_shape %2000 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1764 = arith.constant 0.000000e+00 : f32 + %2001 = tensor.empty() : tensor<32x80x80xf32> + %2002 = 
linalg.fill ins(%cst_1764 : f32) outs(%2001 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2003 = linalg.batch_matmul ins(%collapsed_1761, %collapsed_1763 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2002 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1765 = tensor.expand_shape %2003 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1766 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2004 = tensor.empty() : tensor<1x32x80x80xf32> + %2005 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1766 : tensor<1x32x80x80xf32>) outs(%2004 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2006 = tensor.empty() : tensor<1x32x80x80xf32> + %2007 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1765, %2005 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2006 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2008 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1767 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2009 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2007, %collapsed_1767 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2008 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2010 = tensor.empty() : tensor<1x32x80x1xf32> + %2011 = linalg.generic {indexing_maps = [#map7], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} outs(%2010 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2012 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2009 : tensor<1x32x80x80xf32>) outs(%2010 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2013 = tensor.empty() : tensor<1x32x80x80xf32> + %2014 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2009, %2012 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2013 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2015 = tensor.empty() : tensor<1x32x80x1xf32> + %2016 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2015 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2017 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2014 : tensor<1x32x80x80xf32>) outs(%2016 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2018 = tensor.empty() : tensor<1x32x80x80xf32> + %2019 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2014, %2017 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2018 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = 
arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1768 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1769 = tensor.collapse_shape %2019 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1770 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1771 = tensor.collapse_shape %1968 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1772 = arith.constant 0.000000e+00 : f32 + %2020 = tensor.empty() : tensor<32x80x128xf32> + %2021 = linalg.fill ins(%cst_1772 : f32) outs(%2020 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2022 = linalg.batch_matmul ins(%collapsed_1769, %collapsed_1771 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2021 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1773 = tensor.expand_shape %2022 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2023 = tensor.empty() : tensor<1x80x32x128xf32> + %2024 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1773 : tensor<1x32x80x128xf32>) outs(%2023 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1774 = tensor.collapse_shape %2024 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2025 = tensor.empty() : tensor<4096x4096xf32> + %2026 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_310 : tensor<4096x4096xf32>) outs(%2025 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1775 = tensor.collapse_shape %collapsed_1774 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1776 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2027 = linalg.matmul {cast = #linalg.type_fn} 
ins(%collapsed_1775, %2026 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1776 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1777 = tensor.expand_shape %2027 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2028 = tensor.empty() : tensor<1x80x4096xf32> + %2029 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1942, %expanded_1777 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2028 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2030 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1778 = arith.constant 2.000000e+00 : f32 + %2031 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029 : tensor<1x80x4096xf32>) outs(%2030 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1778 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1779 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2032 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2031 : tensor<1x80x4096xf32>) outs(%cst_1779 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1780 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2033 = tensor.empty() : tensor<1x80x1xf32> + %2034 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2032, %cst_1780 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2033 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : 
f32 + } -> tensor<1x80x1xf32> + %2035 = tensor.empty() : tensor<1x80x1xf32> + %2036 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2034 : tensor<1x80x1xf32>) outs(%2035 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2037 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1781 = tensor.collapse_shape %2036 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2038 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029, %collapsed_1781 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2037 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1782 = tensor.expand_shape %extracted_slice_34 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2039 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1783 = tensor.collapse_shape %expanded_1782 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2040 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1783, %2038 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2039 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2041 = tensor.empty() : tensor<4096x11008xf32> + %2042 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_312 : tensor<11008x4096xf32>) outs(%2041 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1784 = tensor.collapse_shape %2040 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1785 = 
arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2043 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1784, %2042 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1785 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1786 = tensor.expand_shape %2043 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2044 = tensor.empty() : tensor<1x80x11008xf32> + %2045 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1786 : tensor<1x80x11008xf32>) outs(%2044 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2046 = tensor.empty() : tensor<4096x11008xf32> + %2047 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_314 : tensor<11008x4096xf32>) outs(%2046 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1787 = tensor.collapse_shape %2040 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1788 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2048 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1787, %2047 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1788 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1789 = tensor.expand_shape %2048 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2049 = tensor.empty() : tensor<1x80x11008xf32> + %2050 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2045, %expanded_1789 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2049 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: 
f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2051 = tensor.empty() : tensor<11008x4096xf32> + %2052 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_316 : tensor<4096x11008xf32>) outs(%2051 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1790 = tensor.collapse_shape %2050 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1791 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2053 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1790, %2052 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1791 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1792 = tensor.expand_shape %2053 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2054 = tensor.empty() : tensor<1x80x4096xf32> + %2055 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2029, %expanded_1792 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2054 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2056 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1793 = arith.constant 2.000000e+00 : f32 + %2057 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055 : tensor<1x80x4096xf32>) outs(%2056 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1793 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1794 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2058 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2057 : tensor<1x80x4096xf32>) outs(%cst_1794 : tensor<1x80x1xf32>) { 
+ ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1795 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2059 = tensor.empty() : tensor<1x80x1xf32> + %2060 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2058, %cst_1795 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2059 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2061 = tensor.empty() : tensor<1x80x1xf32> + %2062 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2060 : tensor<1x80x1xf32>) outs(%2061 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2063 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1796 = tensor.collapse_shape %2062 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2064 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055, %collapsed_1796 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2063 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1797 = tensor.expand_shape %extracted_slice_35 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2065 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1798 = tensor.collapse_shape %expanded_1797 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2066 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1798, %2064 : tensor<1x4096xf32>, 
tensor<1x80x4096xf32>) outs(%2065 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2067 = tensor.empty() : tensor<4096x4096xf32> + %2068 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_318 : tensor<4096x4096xf32>) outs(%2067 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1799 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1800 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2069 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1799, %2068 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1800 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1801 = tensor.expand_shape %2069 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2070 = tensor.empty() : tensor<4096x4096xf32> + %2071 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_320 : tensor<4096x4096xf32>) outs(%2070 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1802 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1803 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2072 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1802, %2071 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1803 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1804 = tensor.expand_shape %2072 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2073 = tensor.empty() : tensor<4096x4096xf32> + %2074 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_322 : tensor<4096x4096xf32>) outs(%2073 : 
tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1805 = tensor.collapse_shape %2066 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1806 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2075 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1805, %2074 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1806 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1807 = tensor.expand_shape %2075 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1808 = tensor.expand_shape %expanded_1801 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2076 = tensor.empty() : tensor<1x32x80x128xf32> + %2077 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1808 : tensor<1x80x32x128xf32>) outs(%2076 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1809 = tensor.expand_shape %expanded_1804 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2078 = tensor.empty() : tensor<1x32x80x128xf32> + %2079 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1809 : tensor<1x80x32x128xf32>) outs(%2078 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1810 = tensor.expand_shape %expanded_1807 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2080 = tensor.empty() : tensor<1x32x80x128xf32> + %2081 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1810 : tensor<1x80x32x128xf32>) outs(%2080 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> 
tensor<1x32x80x128xf32> + %extracted_slice_1811 = tensor.extract_slice %expanded_588[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1812 = tensor.extract_slice %expanded_590[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2082 = tensor.empty() : tensor<1x80x128xf32> + %2083 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1811 : tensor<1x1x80x128xf32>) outs(%2082 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2084 = tensor.empty() : tensor<80x128xf32> + %2085 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2083 : tensor<1x80x128xf32>) outs(%2084 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2086 = tensor.empty() : tensor<1x80x128xf32> + %2087 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1812 : tensor<1x1x80x128xf32>) outs(%2086 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2088 = tensor.empty() : tensor<80x128xf32> + %2089 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2087 : tensor<1x80x128xf32>) outs(%2088 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2090 = tensor.empty() : tensor<1x80x128xf32> + %2091 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2090 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2085[%3652, %3653] : 
tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1813 = tensor.expand_shape %2091 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2092 = tensor.empty() : tensor<1x80x128xf32> + %2093 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2092 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2089[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1814 = tensor.expand_shape %2093 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2094 = tensor.empty() : tensor<1x32x80x128xf32> + %2095 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2077, %2091 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2094 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1815 = tensor.extract_slice %2077[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1816 = tensor.extract_slice %2077[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2096 = tensor.empty() : tensor<1x32x80x64xf32> + %2097 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1816 : tensor<1x32x80x64xf32>) outs(%2096 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2098 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1817 = tensor.insert_slice %2097 into 
%2098[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1818 = tensor.insert_slice %extracted_slice_1815 into %inserted_slice_1817[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2099 = tensor.empty() : tensor<1x32x80x128xf32> + %2100 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1818, %2093 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2099 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2101 = tensor.empty() : tensor<1x32x80x128xf32> + %2102 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2095, %2100 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2101 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2103 = tensor.empty() : tensor<1x32x80x128xf32> + %2104 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2079, %2091 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2103 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1819 = tensor.extract_slice %2079[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1820 = tensor.extract_slice %2079[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2105 = tensor.empty() : tensor<1x32x80x64xf32> + %2106 = linalg.generic {indexing_maps = [#map7, 
#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1820 : tensor<1x32x80x64xf32>) outs(%2105 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2107 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1821 = tensor.insert_slice %2106 into %2107[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1822 = tensor.insert_slice %extracted_slice_1819 into %inserted_slice_1821[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2108 = tensor.empty() : tensor<1x32x80x128xf32> + %2109 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1822, %2093 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2108 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2110 = tensor.empty() : tensor<1x32x80x128xf32> + %2111 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2104, %2109 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2110 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2112 = tensor.empty() : tensor<1x32x128x80xf32> + %2113 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2111 : tensor<1x32x80x128xf32>) outs(%2112 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1823 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1824 = 
tensor.collapse_shape %2102 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1825 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1826 = tensor.collapse_shape %2113 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1827 = arith.constant 0.000000e+00 : f32 + %2114 = tensor.empty() : tensor<32x80x80xf32> + %2115 = linalg.fill ins(%cst_1827 : f32) outs(%2114 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2116 = linalg.batch_matmul ins(%collapsed_1824, %collapsed_1826 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2115 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1828 = tensor.expand_shape %2116 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1829 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2117 = tensor.empty() : tensor<1x32x80x80xf32> + %2118 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1829 : tensor<1x32x80x80xf32>) outs(%2117 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2119 = tensor.empty() : tensor<1x32x80x80xf32> + %2120 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1828, %2118 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2119 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2121 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1830 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2122 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%2120, %collapsed_1830 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2121 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2123 = tensor.empty() : tensor<1x32x80x1xf32> + %2124 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2123 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2125 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2122 : tensor<1x32x80x80xf32>) outs(%2123 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2126 = tensor.empty() : tensor<1x32x80x80xf32> + %2127 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2122, %2125 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2126 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2128 = tensor.empty() : tensor<1x32x80x1xf32> + %2129 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2128 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2130 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2127 : tensor<1x32x80x80xf32>) outs(%2129 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + 
linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2131 = tensor.empty() : tensor<1x32x80x80xf32> + %2132 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2127, %2130 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2131 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1831 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1832 = tensor.collapse_shape %2132 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1833 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1834 = tensor.collapse_shape %2081 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1835 = arith.constant 0.000000e+00 : f32 + %2133 = tensor.empty() : tensor<32x80x128xf32> + %2134 = linalg.fill ins(%cst_1835 : f32) outs(%2133 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2135 = linalg.batch_matmul ins(%collapsed_1832, %collapsed_1834 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2134 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1836 = tensor.expand_shape %2135 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2136 = tensor.empty() : tensor<1x80x32x128xf32> + %2137 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1836 : tensor<1x32x80x128xf32>) outs(%2136 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1837 = tensor.collapse_shape %2137 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2138 = tensor.empty() : tensor<4096x4096xf32> + %2139 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_324 
: tensor<4096x4096xf32>) outs(%2138 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1838 = tensor.collapse_shape %collapsed_1837 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1839 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2140 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1838, %2139 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1839 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1840 = tensor.expand_shape %2140 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2141 = tensor.empty() : tensor<1x80x4096xf32> + %2142 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2055, %expanded_1840 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2141 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2143 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1841 = arith.constant 2.000000e+00 : f32 + %2144 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142 : tensor<1x80x4096xf32>) outs(%2143 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1841 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1842 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2145 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2144 : tensor<1x80x4096xf32>) outs(%cst_1842 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1843 = arith.constant dense<9.99999974E-6> : 
tensor<1x80x1xf32> + %2146 = tensor.empty() : tensor<1x80x1xf32> + %2147 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2145, %cst_1843 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2146 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2148 = tensor.empty() : tensor<1x80x1xf32> + %2149 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2147 : tensor<1x80x1xf32>) outs(%2148 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2150 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1844 = tensor.collapse_shape %2149 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2151 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142, %collapsed_1844 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2150 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1845 = tensor.expand_shape %extracted_slice_36 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2152 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1846 = tensor.collapse_shape %expanded_1845 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2153 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1846, %2151 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2152 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2154 = tensor.empty() : tensor<4096x11008xf32> + %2155 = 
linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_326 : tensor<11008x4096xf32>) outs(%2154 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1847 = tensor.collapse_shape %2153 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1848 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2156 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1847, %2155 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1848 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1849 = tensor.expand_shape %2156 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2157 = tensor.empty() : tensor<1x80x11008xf32> + %2158 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1849 : tensor<1x80x11008xf32>) outs(%2157 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2159 = tensor.empty() : tensor<4096x11008xf32> + %2160 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_328 : tensor<11008x4096xf32>) outs(%2159 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1850 = tensor.collapse_shape %2153 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1851 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2161 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1850, %2160 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1851 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1852 = tensor.expand_shape %2161 [[0, 
1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2162 = tensor.empty() : tensor<1x80x11008xf32> + %2163 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2158, %expanded_1852 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2162 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2164 = tensor.empty() : tensor<11008x4096xf32> + %2165 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_330 : tensor<4096x11008xf32>) outs(%2164 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1853 = tensor.collapse_shape %2163 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1854 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2166 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1853, %2165 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1854 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1855 = tensor.expand_shape %2166 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2167 = tensor.empty() : tensor<1x80x4096xf32> + %2168 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2142, %expanded_1855 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2167 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2169 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1856 = arith.constant 2.000000e+00 : f32 + %2170 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168 : tensor<1x80x4096xf32>) outs(%2169 : tensor<1x80x4096xf32>) { + ^bb0(%in: 
f32, %out: f32): + %3652 = math.powf %in, %cst_1856 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1857 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2171 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2170 : tensor<1x80x4096xf32>) outs(%cst_1857 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1858 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2172 = tensor.empty() : tensor<1x80x1xf32> + %2173 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2171, %cst_1858 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2172 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2174 = tensor.empty() : tensor<1x80x1xf32> + %2175 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2173 : tensor<1x80x1xf32>) outs(%2174 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2176 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1859 = tensor.collapse_shape %2175 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2177 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168, %collapsed_1859 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2176 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1860 = tensor.expand_shape %extracted_slice_37 [[0, 1, 
2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2178 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1861 = tensor.collapse_shape %expanded_1860 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2179 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1861, %2177 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2178 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2180 = tensor.empty() : tensor<4096x4096xf32> + %2181 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_332 : tensor<4096x4096xf32>) outs(%2180 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1862 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1863 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2182 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1862, %2181 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1863 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1864 = tensor.expand_shape %2182 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2183 = tensor.empty() : tensor<4096x4096xf32> + %2184 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_334 : tensor<4096x4096xf32>) outs(%2183 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1865 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1866 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2185 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1865, %2184 : tensor<80x4096xf32>, tensor<4096x4096xf32>) 
outs(%cst_1866 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1867 = tensor.expand_shape %2185 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2186 = tensor.empty() : tensor<4096x4096xf32> + %2187 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_336 : tensor<4096x4096xf32>) outs(%2186 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1868 = tensor.collapse_shape %2179 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1869 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2188 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1868, %2187 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1869 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1870 = tensor.expand_shape %2188 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1871 = tensor.expand_shape %expanded_1864 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2189 = tensor.empty() : tensor<1x32x80x128xf32> + %2190 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1871 : tensor<1x80x32x128xf32>) outs(%2189 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1872 = tensor.expand_shape %expanded_1867 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2191 = tensor.empty() : tensor<1x32x80x128xf32> + %2192 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1872 : tensor<1x80x32x128xf32>) outs(%2191 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1873 = tensor.expand_shape %expanded_1870 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> 
into tensor<1x80x32x128xf32> + %2193 = tensor.empty() : tensor<1x32x80x128xf32> + %2194 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1873 : tensor<1x80x32x128xf32>) outs(%2193 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1874 = tensor.extract_slice %expanded_592[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1875 = tensor.extract_slice %expanded_594[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2195 = tensor.empty() : tensor<1x80x128xf32> + %2196 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1874 : tensor<1x1x80x128xf32>) outs(%2195 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2197 = tensor.empty() : tensor<80x128xf32> + %2198 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2196 : tensor<1x80x128xf32>) outs(%2197 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2199 = tensor.empty() : tensor<1x80x128xf32> + %2200 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1875 : tensor<1x1x80x128xf32>) outs(%2199 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2201 = tensor.empty() : tensor<80x128xf32> + %2202 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2200 : tensor<1x80x128xf32>) outs(%2201 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2203 = tensor.empty() : tensor<1x80x128xf32> + 
%2204 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2203 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2198[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1876 = tensor.expand_shape %2204 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2205 = tensor.empty() : tensor<1x80x128xf32> + %2206 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2205 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2202[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1877 = tensor.expand_shape %2206 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2207 = tensor.empty() : tensor<1x32x80x128xf32> + %2208 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2190, %2204 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2207 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1878 = tensor.extract_slice %2190[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1879 = tensor.extract_slice %2190[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2209 = tensor.empty() : tensor<1x32x80x64xf32> + %2210 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%extracted_slice_1879 : tensor<1x32x80x64xf32>) outs(%2209 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2211 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1880 = tensor.insert_slice %2210 into %2211[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1881 = tensor.insert_slice %extracted_slice_1878 into %inserted_slice_1880[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2212 = tensor.empty() : tensor<1x32x80x128xf32> + %2213 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1881, %2206 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2212 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2214 = tensor.empty() : tensor<1x32x80x128xf32> + %2215 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2208, %2213 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2214 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2216 = tensor.empty() : tensor<1x32x80x128xf32> + %2217 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2192, %2204 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2216 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1882 = tensor.extract_slice 
%2192[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1883 = tensor.extract_slice %2192[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2218 = tensor.empty() : tensor<1x32x80x64xf32> + %2219 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1883 : tensor<1x32x80x64xf32>) outs(%2218 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2220 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1884 = tensor.insert_slice %2219 into %2220[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1885 = tensor.insert_slice %extracted_slice_1882 into %inserted_slice_1884[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2221 = tensor.empty() : tensor<1x32x80x128xf32> + %2222 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1885, %2206 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2221 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2223 = tensor.empty() : tensor<1x32x80x128xf32> + %2224 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2217, %2222 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2223 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2225 = tensor.empty() : tensor<1x32x128x80xf32> + %2226 = linalg.generic {indexing_maps 
= [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2224 : tensor<1x32x80x128xf32>) outs(%2225 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1886 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1887 = tensor.collapse_shape %2215 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1888 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1889 = tensor.collapse_shape %2226 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1890 = arith.constant 0.000000e+00 : f32 + %2227 = tensor.empty() : tensor<32x80x80xf32> + %2228 = linalg.fill ins(%cst_1890 : f32) outs(%2227 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2229 = linalg.batch_matmul ins(%collapsed_1887, %collapsed_1889 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2228 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1891 = tensor.expand_shape %2229 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1892 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2230 = tensor.empty() : tensor<1x32x80x80xf32> + %2231 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1892 : tensor<1x32x80x80xf32>) outs(%2230 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2232 = tensor.empty() : tensor<1x32x80x80xf32> + %2233 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1891, %2231 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2232 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, 
%in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2234 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1893 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2235 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2233, %collapsed_1893 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2234 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2236 = tensor.empty() : tensor<1x32x80x1xf32> + %2237 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2236 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2238 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2235 : tensor<1x32x80x80xf32>) outs(%2236 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2239 = tensor.empty() : tensor<1x32x80x80xf32> + %2240 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2235, %2238 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2239 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2241 = tensor.empty() : tensor<1x32x80x1xf32> + %2242 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2241 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 
0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2243 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2240 : tensor<1x32x80x80xf32>) outs(%2242 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2244 = tensor.empty() : tensor<1x32x80x80xf32> + %2245 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2240, %2243 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2244 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1894 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1895 = tensor.collapse_shape %2245 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1896 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1897 = tensor.collapse_shape %2194 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1898 = arith.constant 0.000000e+00 : f32 + %2246 = tensor.empty() : tensor<32x80x128xf32> + %2247 = linalg.fill ins(%cst_1898 : f32) outs(%2246 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2248 = linalg.batch_matmul ins(%collapsed_1895, %collapsed_1897 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2247 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1899 = tensor.expand_shape %2248 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2249 = tensor.empty() : tensor<1x80x32x128xf32> + %2250 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1899 : tensor<1x32x80x128xf32>) outs(%2249 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: 
f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1900 = tensor.collapse_shape %2250 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2251 = tensor.empty() : tensor<4096x4096xf32> + %2252 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_338 : tensor<4096x4096xf32>) outs(%2251 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1901 = tensor.collapse_shape %collapsed_1900 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1902 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2253 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1901, %2252 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1902 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1903 = tensor.expand_shape %2253 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2254 = tensor.empty() : tensor<1x80x4096xf32> + %2255 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2168, %expanded_1903 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2254 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2256 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1904 = arith.constant 2.000000e+00 : f32 + %2257 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255 : tensor<1x80x4096xf32>) outs(%2256 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1904 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1905 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2258 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", 
"parallel"]} ins(%2257 : tensor<1x80x4096xf32>) outs(%cst_1905 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1906 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2259 = tensor.empty() : tensor<1x80x1xf32> + %2260 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2258, %cst_1906 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2259 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2261 = tensor.empty() : tensor<1x80x1xf32> + %2262 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2260 : tensor<1x80x1xf32>) outs(%2261 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2263 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1907 = tensor.collapse_shape %2262 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2264 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255, %collapsed_1907 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2263 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1908 = tensor.expand_shape %extracted_slice_38 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2265 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1909 = tensor.collapse_shape %expanded_1908 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2266 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = 
["parallel", "parallel", "parallel"]} ins(%collapsed_1909, %2264 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2265 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2267 = tensor.empty() : tensor<4096x11008xf32> + %2268 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_340 : tensor<11008x4096xf32>) outs(%2267 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1910 = tensor.collapse_shape %2266 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1911 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2269 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1910, %2268 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1911 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1912 = tensor.expand_shape %2269 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2270 = tensor.empty() : tensor<1x80x11008xf32> + %2271 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1912 : tensor<1x80x11008xf32>) outs(%2270 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2272 = tensor.empty() : tensor<4096x11008xf32> + %2273 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_342 : tensor<11008x4096xf32>) outs(%2272 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1913 = tensor.collapse_shape %2266 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1914 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2274 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1913, %2273 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1914 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1915 = tensor.expand_shape %2274 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2275 = tensor.empty() : tensor<1x80x11008xf32> + %2276 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2271, %expanded_1915 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2275 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2277 = tensor.empty() : tensor<11008x4096xf32> + %2278 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_344 : tensor<4096x11008xf32>) outs(%2277 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1916 = tensor.collapse_shape %2276 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1917 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2279 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1916, %2278 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1917 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1918 = tensor.expand_shape %2279 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2280 = tensor.empty() : tensor<1x80x4096xf32> + %2281 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2255, %expanded_1918 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2280 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : 
f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2282 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1919 = arith.constant 2.000000e+00 : f32 + %2283 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2281 : tensor<1x80x4096xf32>) outs(%2282 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1919 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1920 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2284 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2283 : tensor<1x80x4096xf32>) outs(%cst_1920 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1921 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2285 = tensor.empty() : tensor<1x80x1xf32> + %2286 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2284, %cst_1921 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2285 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2287 = tensor.empty() : tensor<1x80x1xf32> + %2288 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2286 : tensor<1x80x1xf32>) outs(%2287 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2289 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1922 = tensor.collapse_shape %2288 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2290 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = 
["parallel", "parallel", "parallel"]} ins(%2281, %collapsed_1922 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2289 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1923 = tensor.expand_shape %extracted_slice_39 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2291 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1924 = tensor.collapse_shape %expanded_1923 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2292 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1924, %2290 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2291 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2293 = tensor.empty() : tensor<4096x4096xf32> + %2294 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_346 : tensor<4096x4096xf32>) outs(%2293 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1925 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1926 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2295 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1925, %2294 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1926 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1927 = tensor.expand_shape %2295 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2296 = tensor.empty() : tensor<4096x4096xf32> + %2297 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_348 : tensor<4096x4096xf32>) outs(%2296 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1928 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1929 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2298 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1928, %2297 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1929 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1930 = tensor.expand_shape %2298 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2299 = tensor.empty() : tensor<4096x4096xf32> + %2300 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_350 : tensor<4096x4096xf32>) outs(%2299 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1931 = tensor.collapse_shape %2292 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1932 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2301 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1931, %2300 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1932 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1933 = tensor.expand_shape %2301 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1934 = tensor.expand_shape %expanded_1927 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2302 = tensor.empty() : tensor<1x32x80x128xf32> + %2303 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1934 : tensor<1x80x32x128xf32>) outs(%2302 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1935 = tensor.expand_shape %expanded_1930 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2304 = tensor.empty() : tensor<1x32x80x128xf32> + %2305 = linalg.generic 
{indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1935 : tensor<1x80x32x128xf32>) outs(%2304 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1936 = tensor.expand_shape %expanded_1933 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2306 = tensor.empty() : tensor<1x32x80x128xf32> + %2307 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1936 : tensor<1x80x32x128xf32>) outs(%2306 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1937 = tensor.extract_slice %expanded_596[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_1938 = tensor.extract_slice %expanded_598[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2308 = tensor.empty() : tensor<1x80x128xf32> + %2309 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1937 : tensor<1x1x80x128xf32>) outs(%2308 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2310 = tensor.empty() : tensor<80x128xf32> + %2311 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2309 : tensor<1x80x128xf32>) outs(%2310 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2312 = tensor.empty() : tensor<1x80x128xf32> + %2313 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1938 : tensor<1x1x80x128xf32>) outs(%2312 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> 
tensor<1x80x128xf32> + %2314 = tensor.empty() : tensor<80x128xf32> + %2315 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2313 : tensor<1x80x128xf32>) outs(%2314 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2316 = tensor.empty() : tensor<1x80x128xf32> + %2317 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2316 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2311[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1939 = tensor.expand_shape %2317 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2318 = tensor.empty() : tensor<1x80x128xf32> + %2319 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2318 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2315[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_1940 = tensor.expand_shape %2319 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2320 = tensor.empty() : tensor<1x32x80x128xf32> + %2321 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2303, %2317 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2320 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1941 = tensor.extract_slice %2303[0, 0, 0, 0] [1, 32, 80, 
64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1942 = tensor.extract_slice %2303[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2322 = tensor.empty() : tensor<1x32x80x64xf32> + %2323 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1942 : tensor<1x32x80x64xf32>) outs(%2322 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2324 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1943 = tensor.insert_slice %2323 into %2324[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1944 = tensor.insert_slice %extracted_slice_1941 into %inserted_slice_1943[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2325 = tensor.empty() : tensor<1x32x80x128xf32> + %2326 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1944, %2319 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2325 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2327 = tensor.empty() : tensor<1x32x80x128xf32> + %2328 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2321, %2326 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2327 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2329 = tensor.empty() : tensor<1x32x80x128xf32> + %2330 = linalg.generic {indexing_maps = [#map7, #map13, #map7], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2305, %2317 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2329 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_1945 = tensor.extract_slice %2305[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_1946 = tensor.extract_slice %2305[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2331 = tensor.empty() : tensor<1x32x80x64xf32> + %2332 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_1946 : tensor<1x32x80x64xf32>) outs(%2331 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2333 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_1947 = tensor.insert_slice %2332 into %2333[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_1948 = tensor.insert_slice %extracted_slice_1945 into %inserted_slice_1947[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2334 = tensor.empty() : tensor<1x32x80x128xf32> + %2335 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_1948, %2319 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2334 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2336 = tensor.empty() : tensor<1x32x80x128xf32> + %2337 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%2330, %2335 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2336 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2338 = tensor.empty() : tensor<1x32x128x80xf32> + %2339 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2337 : tensor<1x32x80x128xf32>) outs(%2338 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_1949 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1950 = tensor.collapse_shape %2328 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1951 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_1952 = tensor.collapse_shape %2339 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_1953 = arith.constant 0.000000e+00 : f32 + %2340 = tensor.empty() : tensor<32x80x80xf32> + %2341 = linalg.fill ins(%cst_1953 : f32) outs(%2340 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2342 = linalg.batch_matmul ins(%collapsed_1950, %collapsed_1952 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2341 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_1954 = tensor.expand_shape %2342 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_1955 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2343 = tensor.empty() : tensor<1x32x80x80xf32> + %2344 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1955 : tensor<1x32x80x80xf32>) outs(%2343 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + 
%2345 = tensor.empty() : tensor<1x32x80x80xf32> + %2346 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1954, %2344 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2345 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2347 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_1956 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2348 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2346, %collapsed_1956 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2347 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2349 = tensor.empty() : tensor<1x32x80x1xf32> + %2350 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2349 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2351 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2348 : tensor<1x32x80x80xf32>) outs(%2350 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2352 = tensor.empty() : tensor<1x32x80x80xf32> + %2353 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2348, %2351 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2352 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf
%in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2354 = tensor.empty() : tensor<1x32x80x1xf32> + %2355 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2354 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2356 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2353 : tensor<1x32x80x80xf32>) outs(%2355 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2357 = tensor.empty() : tensor<1x32x80x80xf32> + %2358 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2353, %2356 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2357 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_1957 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_1958 = tensor.collapse_shape %2358 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_1959 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_1960 = tensor.collapse_shape %2307 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_1961 = arith.constant 0.000000e+00 : f32 + %2359 = tensor.empty() : tensor<32x80x128xf32> + %2360 = linalg.fill ins(%cst_1961 : f32) outs(%2359 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2361 = linalg.batch_matmul ins(%collapsed_1958, %collapsed_1960 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2360 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_1962 = tensor.expand_shape 
%2361 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2362 = tensor.empty() : tensor<1x80x32x128xf32> + %2363 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1962 : tensor<1x32x80x128xf32>) outs(%2362 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_1963 = tensor.collapse_shape %2363 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2364 = tensor.empty() : tensor<4096x4096xf32> + %2365 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_352 : tensor<4096x4096xf32>) outs(%2364 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1964 = tensor.collapse_shape %collapsed_1963 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1965 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2366 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1964, %2365 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1965 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1966 = tensor.expand_shape %2366 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2367 = tensor.empty() : tensor<1x80x4096xf32> + %2368 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2281, %expanded_1966 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2367 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2369 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1967 = arith.constant 2.000000e+00 : f32 + %2370 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368 : 
tensor<1x80x4096xf32>) outs(%2369 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1967 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1968 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2371 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2370 : tensor<1x80x4096xf32>) outs(%cst_1968 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1969 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2372 = tensor.empty() : tensor<1x80x1xf32> + %2373 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2371, %cst_1969 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2372 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2374 = tensor.empty() : tensor<1x80x1xf32> + %2375 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2373 : tensor<1x80x1xf32>) outs(%2374 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2376 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1970 = tensor.collapse_shape %2375 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2377 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368, %collapsed_1970 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2376 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> 
tensor<1x80x4096xf32> + %expanded_1971 = tensor.expand_shape %extracted_slice_40 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2378 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1972 = tensor.collapse_shape %expanded_1971 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2379 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1972, %2377 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2378 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2380 = tensor.empty() : tensor<4096x11008xf32> + %2381 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_354 : tensor<11008x4096xf32>) outs(%2380 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1973 = tensor.collapse_shape %2379 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1974 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2382 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1973, %2381 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1974 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1975 = tensor.expand_shape %2382 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2383 = tensor.empty() : tensor<1x80x11008xf32> + %2384 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_1975 : tensor<1x80x11008xf32>) outs(%2383 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + 
%2385 = tensor.empty() : tensor<4096x11008xf32> + %2386 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_356 : tensor<11008x4096xf32>) outs(%2385 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_1976 = tensor.collapse_shape %2379 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1977 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2387 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1976, %2386 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_1977 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_1978 = tensor.expand_shape %2387 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2388 = tensor.empty() : tensor<1x80x11008xf32> + %2389 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2384, %expanded_1978 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2388 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2390 = tensor.empty() : tensor<11008x4096xf32> + %2391 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_358 : tensor<4096x11008xf32>) outs(%2390 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_1979 = tensor.collapse_shape %2389 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_1980 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2392 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1979, %2391 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_1980 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1981 = tensor.expand_shape %2392 [[0, 1], [2]] : tensor<80x4096xf32> into 
tensor<1x80x4096xf32> + %2393 = tensor.empty() : tensor<1x80x4096xf32> + %2394 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2368, %expanded_1981 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2393 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2395 = tensor.empty() : tensor<1x80x4096xf32> + %cst_1982 = arith.constant 2.000000e+00 : f32 + %2396 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394 : tensor<1x80x4096xf32>) outs(%2395 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_1982 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_1983 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2397 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2396 : tensor<1x80x4096xf32>) outs(%cst_1983 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_1984 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2398 = tensor.empty() : tensor<1x80x1xf32> + %2399 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2397, %cst_1984 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2398 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2400 = tensor.empty() : tensor<1x80x1xf32> + %2401 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2399 : tensor<1x80x1xf32>) 
outs(%2400 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2402 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1985 = tensor.collapse_shape %2401 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2403 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394, %collapsed_1985 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2402 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_1986 = tensor.expand_shape %extracted_slice_41 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2404 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_1987 = tensor.collapse_shape %expanded_1986 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2405 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_1987, %2403 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2404 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2406 = tensor.empty() : tensor<4096x4096xf32> + %2407 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_360 : tensor<4096x4096xf32>) outs(%2406 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1988 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1989 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2408 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1988, %2407 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1989 : tensor<80x4096xf32>) -> 
tensor<80x4096xf32> + %expanded_1990 = tensor.expand_shape %2408 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2409 = tensor.empty() : tensor<4096x4096xf32> + %2410 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_362 : tensor<4096x4096xf32>) outs(%2409 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1991 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1992 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2411 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1991, %2410 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1992 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1993 = tensor.expand_shape %2411 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2412 = tensor.empty() : tensor<4096x4096xf32> + %2413 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_364 : tensor<4096x4096xf32>) outs(%2412 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_1994 = tensor.collapse_shape %2405 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_1995 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2414 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_1994, %2413 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1995 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_1996 = tensor.expand_shape %2414 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_1997 = tensor.expand_shape %expanded_1990 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2415 = tensor.empty() : tensor<1x32x80x128xf32> + %2416 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%expanded_1997 : tensor<1x80x32x128xf32>) outs(%2415 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1998 = tensor.expand_shape %expanded_1993 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2417 = tensor.empty() : tensor<1x32x80x128xf32> + %2418 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1998 : tensor<1x80x32x128xf32>) outs(%2417 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_1999 = tensor.expand_shape %expanded_1996 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2419 = tensor.empty() : tensor<1x32x80x128xf32> + %2420 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1999 : tensor<1x80x32x128xf32>) outs(%2419 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2000 = tensor.extract_slice %expanded_600[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2001 = tensor.extract_slice %expanded_602[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2421 = tensor.empty() : tensor<1x80x128xf32> + %2422 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2000 : tensor<1x1x80x128xf32>) outs(%2421 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2423 = tensor.empty() : tensor<80x128xf32> + %2424 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2422 : tensor<1x80x128xf32>) outs(%2423 : tensor<80x128xf32>) { + 
^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2425 = tensor.empty() : tensor<1x80x128xf32> + %2426 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2001 : tensor<1x1x80x128xf32>) outs(%2425 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2427 = tensor.empty() : tensor<80x128xf32> + %2428 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2426 : tensor<1x80x128xf32>) outs(%2427 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2429 = tensor.empty() : tensor<1x80x128xf32> + %2430 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2429 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2424[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2002 = tensor.expand_shape %2430 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2431 = tensor.empty() : tensor<1x80x128xf32> + %2432 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2431 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2428[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2003 = tensor.expand_shape %2432 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2433 = tensor.empty() : tensor<1x32x80x128xf32> + %2434 = linalg.generic {indexing_maps = [#map7, #map13, #map7], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2416, %2430 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2433 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2004 = tensor.extract_slice %2416[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2005 = tensor.extract_slice %2416[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2435 = tensor.empty() : tensor<1x32x80x64xf32> + %2436 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2005 : tensor<1x32x80x64xf32>) outs(%2435 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2437 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2006 = tensor.insert_slice %2436 into %2437[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2007 = tensor.insert_slice %extracted_slice_2004 into %inserted_slice_2006[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2438 = tensor.empty() : tensor<1x32x80x128xf32> + %2439 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2007, %2432 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2438 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2440 = tensor.empty() : tensor<1x32x80x128xf32> + %2441 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%2434, %2439 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2440 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2442 = tensor.empty() : tensor<1x32x80x128xf32> + %2443 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2418, %2430 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2442 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2008 = tensor.extract_slice %2418[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2009 = tensor.extract_slice %2418[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2444 = tensor.empty() : tensor<1x32x80x64xf32> + %2445 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2009 : tensor<1x32x80x64xf32>) outs(%2444 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2446 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2010 = tensor.insert_slice %2445 into %2446[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2011 = tensor.insert_slice %extracted_slice_2008 into %inserted_slice_2010[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2447 = tensor.empty() : tensor<1x32x80x128xf32> + %2448 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2011, %2432 
: tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2447 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2449 = tensor.empty() : tensor<1x32x80x128xf32> + %2450 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2443, %2448 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2449 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2451 = tensor.empty() : tensor<1x32x128x80xf32> + %2452 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2450 : tensor<1x32x80x128xf32>) outs(%2451 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2012 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2013 = tensor.collapse_shape %2441 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2014 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2015 = tensor.collapse_shape %2452 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2016 = arith.constant 0.000000e+00 : f32 + %2453 = tensor.empty() : tensor<32x80x80xf32> + %2454 = linalg.fill ins(%cst_2016 : f32) outs(%2453 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2455 = linalg.batch_matmul ins(%collapsed_2013, %collapsed_2015 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2454 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2017 = tensor.expand_shape %2455 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2018 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2456 = tensor.empty() : 
tensor<1x32x80x80xf32> + %2457 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2018 : tensor<1x32x80x80xf32>) outs(%2456 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2458 = tensor.empty() : tensor<1x32x80x80xf32> + %2459 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2017, %2457 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2458 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2460 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2019 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2461 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2459, %collapsed_2019 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2460 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2462 = tensor.empty() : tensor<1x32x80x1xf32> + %2463 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2462 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2464 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2461 : tensor<1x32x80x80xf32>) outs(%2462 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + 
linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2465 = tensor.empty() : tensor<1x32x80x80xf32> + %2466 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2461, %2464 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2465 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2467 = tensor.empty() : tensor<1x32x80x1xf32> + %2468 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2467 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2469 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2466 : tensor<1x32x80x80xf32>) outs(%2468 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2470 = tensor.empty() : tensor<1x32x80x80xf32> + %2471 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2466, %2469 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2470 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2020 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2021 = tensor.collapse_shape %2471 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2022 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2023 = tensor.collapse_shape %2420 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2024 
= arith.constant 0.000000e+00 : f32 + %2472 = tensor.empty() : tensor<32x80x128xf32> + %2473 = linalg.fill ins(%cst_2024 : f32) outs(%2472 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2474 = linalg.batch_matmul ins(%collapsed_2021, %collapsed_2023 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2473 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2025 = tensor.expand_shape %2474 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2475 = tensor.empty() : tensor<1x80x32x128xf32> + %2476 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2025 : tensor<1x32x80x128xf32>) outs(%2475 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2026 = tensor.collapse_shape %2476 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2477 = tensor.empty() : tensor<4096x4096xf32> + %2478 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_366 : tensor<4096x4096xf32>) outs(%2477 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2027 = tensor.collapse_shape %collapsed_2026 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2028 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2479 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2027, %2478 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2028 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2029 = tensor.expand_shape %2479 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2480 = tensor.empty() : tensor<1x80x4096xf32> + %2481 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2394, %expanded_2029 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2480 
: tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2482 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2030 = arith.constant 2.000000e+00 : f32 + %2483 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481 : tensor<1x80x4096xf32>) outs(%2482 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2030 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2031 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2484 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2483 : tensor<1x80x4096xf32>) outs(%cst_2031 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2032 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2485 = tensor.empty() : tensor<1x80x1xf32> + %2486 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2484, %cst_2032 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2485 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2487 = tensor.empty() : tensor<1x80x1xf32> + %2488 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2486 : tensor<1x80x1xf32>) outs(%2487 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2489 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2033 = tensor.collapse_shape %2488 [[0], [1, 2]] : 
tensor<1x80x1xf32> into tensor<1x80xf32> + %2490 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481, %collapsed_2033 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2489 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2034 = tensor.expand_shape %extracted_slice_42 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2491 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2035 = tensor.collapse_shape %expanded_2034 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2492 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2035, %2490 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2491 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2493 = tensor.empty() : tensor<4096x11008xf32> + %2494 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_368 : tensor<11008x4096xf32>) outs(%2493 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2036 = tensor.collapse_shape %2492 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2037 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2495 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2036, %2494 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2037 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2038 = tensor.expand_shape %2495 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2496 = tensor.empty() : tensor<1x80x11008xf32> + %2497 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", 
"parallel", "parallel"]} ins(%expanded_2038 : tensor<1x80x11008xf32>) outs(%2496 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2498 = tensor.empty() : tensor<4096x11008xf32> + %2499 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_370 : tensor<11008x4096xf32>) outs(%2498 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2039 = tensor.collapse_shape %2492 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2040 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2500 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2039, %2499 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2040 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2041 = tensor.expand_shape %2500 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2501 = tensor.empty() : tensor<1x80x11008xf32> + %2502 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2497, %expanded_2041 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2501 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2503 = tensor.empty() : tensor<11008x4096xf32> + %2504 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_372 : tensor<4096x11008xf32>) outs(%2503 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2042 = tensor.collapse_shape %2502 [[0, 1], [2]] : 
tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2043 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2505 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2042, %2504 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2043 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2044 = tensor.expand_shape %2505 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2506 = tensor.empty() : tensor<1x80x4096xf32> + %2507 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2481, %expanded_2044 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2506 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2508 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2045 = arith.constant 2.000000e+00 : f32 + %2509 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507 : tensor<1x80x4096xf32>) outs(%2508 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2045 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2046 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2510 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2509 : tensor<1x80x4096xf32>) outs(%cst_2046 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2047 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2511 = tensor.empty() : tensor<1x80x1xf32> + %2512 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2510, %cst_2047 : 
tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2511 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2513 = tensor.empty() : tensor<1x80x1xf32> + %2514 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2512 : tensor<1x80x1xf32>) outs(%2513 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2515 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2048 = tensor.collapse_shape %2514 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2516 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507, %collapsed_2048 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2515 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2049 = tensor.expand_shape %extracted_slice_43 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2517 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2050 = tensor.collapse_shape %expanded_2049 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2518 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2050, %2516 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2517 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2519 = tensor.empty() : tensor<4096x4096xf32> + %2520 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_374 : tensor<4096x4096xf32>) outs(%2519 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2051 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2052 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2521 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2051, %2520 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2052 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2053 = tensor.expand_shape %2521 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2522 = tensor.empty() : tensor<4096x4096xf32> + %2523 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_376 : tensor<4096x4096xf32>) outs(%2522 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2054 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2055 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2524 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2054, %2523 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2055 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2056 = tensor.expand_shape %2524 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2525 = tensor.empty() : tensor<4096x4096xf32> + %2526 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_378 : tensor<4096x4096xf32>) outs(%2525 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2057 = tensor.collapse_shape %2518 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2058 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2527 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2057, %2526 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2058 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + 
%expanded_2059 = tensor.expand_shape %2527 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2060 = tensor.expand_shape %expanded_2053 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2528 = tensor.empty() : tensor<1x32x80x128xf32> + %2529 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2060 : tensor<1x80x32x128xf32>) outs(%2528 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2061 = tensor.expand_shape %expanded_2056 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2530 = tensor.empty() : tensor<1x32x80x128xf32> + %2531 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2061 : tensor<1x80x32x128xf32>) outs(%2530 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2062 = tensor.expand_shape %expanded_2059 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2532 = tensor.empty() : tensor<1x32x80x128xf32> + %2533 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2062 : tensor<1x80x32x128xf32>) outs(%2532 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2063 = tensor.extract_slice %expanded_604[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2064 = tensor.extract_slice %expanded_606[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2534 = tensor.empty() : tensor<1x80x128xf32> + %2535 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%extracted_slice_2063 : tensor<1x1x80x128xf32>) outs(%2534 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2536 = tensor.empty() : tensor<80x128xf32> + %2537 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2535 : tensor<1x80x128xf32>) outs(%2536 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2538 = tensor.empty() : tensor<1x80x128xf32> + %2539 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2064 : tensor<1x1x80x128xf32>) outs(%2538 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2540 = tensor.empty() : tensor<80x128xf32> + %2541 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2539 : tensor<1x80x128xf32>) outs(%2540 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2542 = tensor.empty() : tensor<1x80x128xf32> + %2543 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2542 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2537[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2065 = tensor.expand_shape %2543 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2544 = tensor.empty() : tensor<1x80x128xf32> + %2545 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2544 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = 
linalg.index 2 : index + %extracted = tensor.extract %2541[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2066 = tensor.expand_shape %2545 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2546 = tensor.empty() : tensor<1x32x80x128xf32> + %2547 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2529, %2543 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2546 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2067 = tensor.extract_slice %2529[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2068 = tensor.extract_slice %2529[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2548 = tensor.empty() : tensor<1x32x80x64xf32> + %2549 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2068 : tensor<1x32x80x64xf32>) outs(%2548 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2550 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2069 = tensor.insert_slice %2549 into %2550[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2070 = tensor.insert_slice %extracted_slice_2067 into %inserted_slice_2069[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2551 = tensor.empty() : tensor<1x32x80x128xf32> + %2552 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2070, %2545 : 
tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2551 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2553 = tensor.empty() : tensor<1x32x80x128xf32> + %2554 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2547, %2552 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2553 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2555 = tensor.empty() : tensor<1x32x80x128xf32> + %2556 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2531, %2543 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2555 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2071 = tensor.extract_slice %2531[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2072 = tensor.extract_slice %2531[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2557 = tensor.empty() : tensor<1x32x80x64xf32> + %2558 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2072 : tensor<1x32x80x64xf32>) outs(%2557 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2559 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2073 = tensor.insert_slice %2558 into %2559[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + 
%inserted_slice_2074 = tensor.insert_slice %extracted_slice_2071 into %inserted_slice_2073[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2560 = tensor.empty() : tensor<1x32x80x128xf32> + %2561 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2074, %2545 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2560 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2562 = tensor.empty() : tensor<1x32x80x128xf32> + %2563 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2556, %2561 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2562 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2564 = tensor.empty() : tensor<1x32x128x80xf32> + %2565 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2563 : tensor<1x32x80x128xf32>) outs(%2564 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2075 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2076 = tensor.collapse_shape %2554 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2077 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2078 = tensor.collapse_shape %2565 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2079 = arith.constant 0.000000e+00 : f32 + %2566 = tensor.empty() : tensor<32x80x80xf32> + %2567 = linalg.fill ins(%cst_2079 : f32) outs(%2566 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + 
%2568 = linalg.batch_matmul ins(%collapsed_2076, %collapsed_2078 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2567 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2080 = tensor.expand_shape %2568 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2081 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2569 = tensor.empty() : tensor<1x32x80x80xf32> + %2570 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2081 : tensor<1x32x80x80xf32>) outs(%2569 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2571 = tensor.empty() : tensor<1x32x80x80xf32> + %2572 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2080, %2570 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2571 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2573 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2082 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2574 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2572, %collapsed_2082 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2573 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2575 = tensor.empty() : tensor<1x32x80x1xf32> + %2576 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2575 : tensor<1x32x80x1xf32>) { + ^bb0(%out: 
f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2577 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2574 : tensor<1x32x80x80xf32>) outs(%2575 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2578 = tensor.empty() : tensor<1x32x80x80xf32> + %2579 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2574, %2577 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2578 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2580 = tensor.empty() : tensor<1x32x80x1xf32> + %2581 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2580 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2582 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2579 : tensor<1x32x80x80xf32>) outs(%2581 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2583 = tensor.empty() : tensor<1x32x80x80xf32> + %2584 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2579, %2582 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2583 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2083 = 
arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2084 = tensor.collapse_shape %2584 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2085 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2086 = tensor.collapse_shape %2533 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2087 = arith.constant 0.000000e+00 : f32 + %2585 = tensor.empty() : tensor<32x80x128xf32> + %2586 = linalg.fill ins(%cst_2087 : f32) outs(%2585 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2587 = linalg.batch_matmul ins(%collapsed_2084, %collapsed_2086 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2586 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2088 = tensor.expand_shape %2587 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2588 = tensor.empty() : tensor<1x80x32x128xf32> + %2589 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2088 : tensor<1x32x80x128xf32>) outs(%2588 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2089 = tensor.collapse_shape %2589 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2590 = tensor.empty() : tensor<4096x4096xf32> + %2591 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_380 : tensor<4096x4096xf32>) outs(%2590 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2090 = tensor.collapse_shape %collapsed_2089 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2091 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2592 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2090, %2591 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2091 : tensor<80x4096xf32>) 
-> tensor<80x4096xf32> + %expanded_2092 = tensor.expand_shape %2592 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2593 = tensor.empty() : tensor<1x80x4096xf32> + %2594 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2507, %expanded_2092 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2593 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2595 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2093 = arith.constant 2.000000e+00 : f32 + %2596 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594 : tensor<1x80x4096xf32>) outs(%2595 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2093 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2094 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2597 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2596 : tensor<1x80x4096xf32>) outs(%cst_2094 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2095 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2598 = tensor.empty() : tensor<1x80x1xf32> + %2599 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2597, %cst_2095 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2598 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2600 = tensor.empty() : tensor<1x80x1xf32> + %2601 = linalg.generic 
{indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2599 : tensor<1x80x1xf32>) outs(%2600 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2602 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2096 = tensor.collapse_shape %2601 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2603 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594, %collapsed_2096 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2602 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2097 = tensor.expand_shape %extracted_slice_44 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2604 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2098 = tensor.collapse_shape %expanded_2097 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2605 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2098, %2603 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2604 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2606 = tensor.empty() : tensor<4096x11008xf32> + %2607 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_382 : tensor<11008x4096xf32>) outs(%2606 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2099 = tensor.collapse_shape %2605 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2100 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2608 = linalg.matmul {cast = #linalg.type_fn} 
ins(%collapsed_2099, %2607 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2100 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2101 = tensor.expand_shape %2608 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2609 = tensor.empty() : tensor<1x80x11008xf32> + %2610 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2101 : tensor<1x80x11008xf32>) outs(%2609 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2611 = tensor.empty() : tensor<4096x11008xf32> + %2612 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_384 : tensor<11008x4096xf32>) outs(%2611 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2102 = tensor.collapse_shape %2605 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2103 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2613 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2102, %2612 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2103 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2104 = tensor.expand_shape %2613 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2614 = tensor.empty() : tensor<1x80x11008xf32> + %2615 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2610, %expanded_2104 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2614 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + 
%2616 = tensor.empty() : tensor<11008x4096xf32> + %2617 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_386 : tensor<4096x11008xf32>) outs(%2616 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2105 = tensor.collapse_shape %2615 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2106 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2618 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2105, %2617 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2106 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2107 = tensor.expand_shape %2618 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2619 = tensor.empty() : tensor<1x80x4096xf32> + %2620 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2594, %expanded_2107 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2619 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2621 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2108 = arith.constant 2.000000e+00 : f32 + %2622 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620 : tensor<1x80x4096xf32>) outs(%2621 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2108 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2109 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2623 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2622 : tensor<1x80x4096xf32>) outs(%cst_2109 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, 
%cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2110 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2624 = tensor.empty() : tensor<1x80x1xf32> + %2625 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2623, %cst_2110 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2624 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2626 = tensor.empty() : tensor<1x80x1xf32> + %2627 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2625 : tensor<1x80x1xf32>) outs(%2626 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2628 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2111 = tensor.collapse_shape %2627 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2629 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620, %collapsed_2111 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2628 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2112 = tensor.expand_shape %extracted_slice_45 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2630 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2113 = tensor.collapse_shape %expanded_2112 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2631 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2113, %2629 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2630 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): 
+ %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2632 = tensor.empty() : tensor<4096x4096xf32> + %2633 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_388 : tensor<4096x4096xf32>) outs(%2632 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2114 = tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2115 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2634 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2114, %2633 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2115 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2116 = tensor.expand_shape %2634 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2635 = tensor.empty() : tensor<4096x4096xf32> + %2636 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_390 : tensor<4096x4096xf32>) outs(%2635 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2117 = tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2118 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2637 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2117, %2636 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2118 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2119 = tensor.expand_shape %2637 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2638 = tensor.empty() : tensor<4096x4096xf32> + %2639 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_392 : tensor<4096x4096xf32>) outs(%2638 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2120 
= tensor.collapse_shape %2631 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2121 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2640 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2120, %2639 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2121 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2122 = tensor.expand_shape %2640 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2123 = tensor.expand_shape %expanded_2116 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2641 = tensor.empty() : tensor<1x32x80x128xf32> + %2642 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2123 : tensor<1x80x32x128xf32>) outs(%2641 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2124 = tensor.expand_shape %expanded_2119 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2643 = tensor.empty() : tensor<1x32x80x128xf32> + %2644 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2124 : tensor<1x80x32x128xf32>) outs(%2643 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2125 = tensor.expand_shape %expanded_2122 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2645 = tensor.empty() : tensor<1x32x80x128xf32> + %2646 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2125 : tensor<1x80x32x128xf32>) outs(%2645 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2126 = tensor.extract_slice %expanded_608[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : 
tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2127 = tensor.extract_slice %expanded_610[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2647 = tensor.empty() : tensor<1x80x128xf32> + %2648 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2126 : tensor<1x1x80x128xf32>) outs(%2647 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2649 = tensor.empty() : tensor<80x128xf32> + %2650 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2648 : tensor<1x80x128xf32>) outs(%2649 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2651 = tensor.empty() : tensor<1x80x128xf32> + %2652 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2127 : tensor<1x1x80x128xf32>) outs(%2651 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2653 = tensor.empty() : tensor<80x128xf32> + %2654 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2652 : tensor<1x80x128xf32>) outs(%2653 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2655 = tensor.empty() : tensor<1x80x128xf32> + %2656 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2655 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2650[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2128 = tensor.expand_shape %2656 [[0, 1], [2], [3]] : 
tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2657 = tensor.empty() : tensor<1x80x128xf32> + %2658 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2657 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2654[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2129 = tensor.expand_shape %2658 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2659 = tensor.empty() : tensor<1x32x80x128xf32> + %2660 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2642, %2656 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2659 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2130 = tensor.extract_slice %2642[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2131 = tensor.extract_slice %2642[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2661 = tensor.empty() : tensor<1x32x80x64xf32> + %2662 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2131 : tensor<1x32x80x64xf32>) outs(%2661 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2663 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2132 = tensor.insert_slice %2662 into %2663[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2133 = tensor.insert_slice 
%extracted_slice_2130 into %inserted_slice_2132[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2664 = tensor.empty() : tensor<1x32x80x128xf32> + %2665 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2133, %2658 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2664 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2666 = tensor.empty() : tensor<1x32x80x128xf32> + %2667 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2660, %2665 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2666 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2668 = tensor.empty() : tensor<1x32x80x128xf32> + %2669 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2644, %2656 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2668 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2134 = tensor.extract_slice %2644[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2135 = tensor.extract_slice %2644[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2670 = tensor.empty() : tensor<1x32x80x64xf32> + %2671 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2135 : tensor<1x32x80x64xf32>) outs(%2670 : 
tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2672 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2136 = tensor.insert_slice %2671 into %2672[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2137 = tensor.insert_slice %extracted_slice_2134 into %inserted_slice_2136[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2673 = tensor.empty() : tensor<1x32x80x128xf32> + %2674 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2137, %2658 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2673 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2675 = tensor.empty() : tensor<1x32x80x128xf32> + %2676 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2669, %2674 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2675 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2677 = tensor.empty() : tensor<1x32x128x80xf32> + %2678 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2676 : tensor<1x32x80x128xf32>) outs(%2677 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2138 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2139 = tensor.collapse_shape %2667 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2140 = arith.constant 
dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2141 = tensor.collapse_shape %2678 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2142 = arith.constant 0.000000e+00 : f32 + %2679 = tensor.empty() : tensor<32x80x80xf32> + %2680 = linalg.fill ins(%cst_2142 : f32) outs(%2679 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2681 = linalg.batch_matmul ins(%collapsed_2139, %collapsed_2141 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2680 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2143 = tensor.expand_shape %2681 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2144 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2682 = tensor.empty() : tensor<1x32x80x80xf32> + %2683 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2144 : tensor<1x32x80x80xf32>) outs(%2682 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2684 = tensor.empty() : tensor<1x32x80x80xf32> + %2685 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2143, %2683 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2684 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2686 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2145 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2687 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2685, %collapsed_2145 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2686 : 
tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2688 = tensor.empty() : tensor<1x32x80x1xf32> + %2689 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2688 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2690 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2687 : tensor<1x32x80x80xf32>) outs(%2688 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2691 = tensor.empty() : tensor<1x32x80x80xf32> + %2692 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2687, %2690 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2691 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2693 = tensor.empty() : tensor<1x32x80x1xf32> + %2694 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2693 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2695 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2692 : tensor<1x32x80x80xf32>) outs(%2694 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2696 = tensor.empty() : tensor<1x32x80x80xf32> + %2697 
= linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2692, %2695 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2696 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2146 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2147 = tensor.collapse_shape %2697 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2148 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2149 = tensor.collapse_shape %2646 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2150 = arith.constant 0.000000e+00 : f32 + %2698 = tensor.empty() : tensor<32x80x128xf32> + %2699 = linalg.fill ins(%cst_2150 : f32) outs(%2698 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2700 = linalg.batch_matmul ins(%collapsed_2147, %collapsed_2149 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2699 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2151 = tensor.expand_shape %2700 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2701 = tensor.empty() : tensor<1x80x32x128xf32> + %2702 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2151 : tensor<1x32x80x128xf32>) outs(%2701 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2152 = tensor.collapse_shape %2702 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2703 = tensor.empty() : tensor<4096x4096xf32> + %2704 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_394 : tensor<4096x4096xf32>) outs(%2703 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : 
f32 + } -> tensor<4096x4096xf32> + %collapsed_2153 = tensor.collapse_shape %collapsed_2152 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2154 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2705 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2153, %2704 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2154 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2155 = tensor.expand_shape %2705 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2706 = tensor.empty() : tensor<1x80x4096xf32> + %2707 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2620, %expanded_2155 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2706 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2708 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2156 = arith.constant 2.000000e+00 : f32 + %2709 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707 : tensor<1x80x4096xf32>) outs(%2708 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2156 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2157 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2710 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2709 : tensor<1x80x4096xf32>) outs(%cst_2157 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2158 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2711 = tensor.empty() : tensor<1x80x1xf32> + %2712 = linalg.generic {indexing_maps = [#map2, 
#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2710, %cst_2158 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2711 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2713 = tensor.empty() : tensor<1x80x1xf32> + %2714 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2712 : tensor<1x80x1xf32>) outs(%2713 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2715 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2159 = tensor.collapse_shape %2714 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2716 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707, %collapsed_2159 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2715 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2160 = tensor.expand_shape %extracted_slice_46 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2717 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2161 = tensor.collapse_shape %expanded_2160 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2718 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2161, %2716 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2717 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2719 = tensor.empty() : tensor<4096x11008xf32> + %2720 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_396 : 
tensor<11008x4096xf32>) outs(%2719 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2162 = tensor.collapse_shape %2718 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2163 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2721 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2162, %2720 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2163 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2164 = tensor.expand_shape %2721 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2722 = tensor.empty() : tensor<1x80x11008xf32> + %2723 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2164 : tensor<1x80x11008xf32>) outs(%2722 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2724 = tensor.empty() : tensor<4096x11008xf32> + %2725 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_398 : tensor<11008x4096xf32>) outs(%2724 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2165 = tensor.collapse_shape %2718 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2166 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2726 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2165, %2725 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2166 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2167 = tensor.expand_shape %2726 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2727 = tensor.empty() : tensor<1x80x11008xf32> + 
%2728 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2723, %expanded_2167 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2727 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2729 = tensor.empty() : tensor<11008x4096xf32> + %2730 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_400 : tensor<4096x11008xf32>) outs(%2729 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2168 = tensor.collapse_shape %2728 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2169 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2731 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2168, %2730 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2169 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2170 = tensor.expand_shape %2731 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2732 = tensor.empty() : tensor<1x80x4096xf32> + %2733 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2707, %expanded_2170 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2732 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2734 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2171 = arith.constant 2.000000e+00 : f32 + %2735 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733 : tensor<1x80x4096xf32>) outs(%2734 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2171 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> 
+ %cst_2172 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2736 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2735 : tensor<1x80x4096xf32>) outs(%cst_2172 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2173 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2737 = tensor.empty() : tensor<1x80x1xf32> + %2738 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2736, %cst_2173 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2737 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2739 = tensor.empty() : tensor<1x80x1xf32> + %2740 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2738 : tensor<1x80x1xf32>) outs(%2739 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2741 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2174 = tensor.collapse_shape %2740 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2742 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733, %collapsed_2174 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2741 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2175 = tensor.expand_shape %extracted_slice_47 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2743 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2176 
= tensor.collapse_shape %expanded_2175 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2744 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2176, %2742 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2743 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2745 = tensor.empty() : tensor<4096x4096xf32> + %2746 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_402 : tensor<4096x4096xf32>) outs(%2745 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2177 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2178 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2747 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2177, %2746 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2178 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2179 = tensor.expand_shape %2747 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2748 = tensor.empty() : tensor<4096x4096xf32> + %2749 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_404 : tensor<4096x4096xf32>) outs(%2748 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2180 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2181 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2750 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2180, %2749 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2181 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2182 = tensor.expand_shape %2750 [[0, 1], [2]] : 
tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2751 = tensor.empty() : tensor<4096x4096xf32> + %2752 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_406 : tensor<4096x4096xf32>) outs(%2751 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2183 = tensor.collapse_shape %2744 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2184 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2753 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2183, %2752 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2184 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2185 = tensor.expand_shape %2753 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2186 = tensor.expand_shape %expanded_2179 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2754 = tensor.empty() : tensor<1x32x80x128xf32> + %2755 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2186 : tensor<1x80x32x128xf32>) outs(%2754 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2187 = tensor.expand_shape %expanded_2182 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2756 = tensor.empty() : tensor<1x32x80x128xf32> + %2757 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2187 : tensor<1x80x32x128xf32>) outs(%2756 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2188 = tensor.expand_shape %expanded_2185 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2758 = tensor.empty() : tensor<1x32x80x128xf32> + %2759 = linalg.generic {indexing_maps = 
[#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2188 : tensor<1x80x32x128xf32>) outs(%2758 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2189 = tensor.extract_slice %expanded_612[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2190 = tensor.extract_slice %expanded_614[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2760 = tensor.empty() : tensor<1x80x128xf32> + %2761 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2189 : tensor<1x1x80x128xf32>) outs(%2760 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2762 = tensor.empty() : tensor<80x128xf32> + %2763 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2761 : tensor<1x80x128xf32>) outs(%2762 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2764 = tensor.empty() : tensor<1x80x128xf32> + %2765 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2190 : tensor<1x1x80x128xf32>) outs(%2764 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2766 = tensor.empty() : tensor<80x128xf32> + %2767 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2765 : tensor<1x80x128xf32>) outs(%2766 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2768 = tensor.empty() : tensor<1x80x128xf32> + %2769 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : 
tensor<1x80xi64>) outs(%2768 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2763[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2191 = tensor.expand_shape %2769 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2770 = tensor.empty() : tensor<1x80x128xf32> + %2771 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2770 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2767[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2192 = tensor.expand_shape %2771 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2772 = tensor.empty() : tensor<1x32x80x128xf32> + %2773 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2755, %2769 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2772 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2193 = tensor.extract_slice %2755[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2194 = tensor.extract_slice %2755[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2774 = tensor.empty() : tensor<1x32x80x64xf32> + %2775 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2194 : tensor<1x32x80x64xf32>) outs(%2774 : tensor<1x32x80x64xf32>) { + ^bb0(%in: 
f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2776 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2195 = tensor.insert_slice %2775 into %2776[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2196 = tensor.insert_slice %extracted_slice_2193 into %inserted_slice_2195[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2777 = tensor.empty() : tensor<1x32x80x128xf32> + %2778 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2196, %2771 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2777 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2779 = tensor.empty() : tensor<1x32x80x128xf32> + %2780 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2773, %2778 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2779 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2781 = tensor.empty() : tensor<1x32x80x128xf32> + %2782 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2757, %2769 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2781 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2197 = tensor.extract_slice %2757[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2198 = 
tensor.extract_slice %2757[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2783 = tensor.empty() : tensor<1x32x80x64xf32> + %2784 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2198 : tensor<1x32x80x64xf32>) outs(%2783 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2785 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2199 = tensor.insert_slice %2784 into %2785[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2200 = tensor.insert_slice %extracted_slice_2197 into %inserted_slice_2199[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2786 = tensor.empty() : tensor<1x32x80x128xf32> + %2787 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2200, %2771 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2786 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2788 = tensor.empty() : tensor<1x32x80x128xf32> + %2789 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2782, %2787 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2788 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2790 = tensor.empty() : tensor<1x32x128x80xf32> + %2791 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2789 : tensor<1x32x80x128xf32>) 
outs(%2790 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2201 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2202 = tensor.collapse_shape %2780 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2203 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2204 = tensor.collapse_shape %2791 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2205 = arith.constant 0.000000e+00 : f32 + %2792 = tensor.empty() : tensor<32x80x80xf32> + %2793 = linalg.fill ins(%cst_2205 : f32) outs(%2792 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2794 = linalg.batch_matmul ins(%collapsed_2202, %collapsed_2204 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2793 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2206 = tensor.expand_shape %2794 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2207 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2795 = tensor.empty() : tensor<1x32x80x80xf32> + %2796 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2207 : tensor<1x32x80x80xf32>) outs(%2795 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2797 = tensor.empty() : tensor<1x32x80x80xf32> + %2798 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2206, %2796 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2797 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2799 = tensor.empty() : tensor<1x32x80x80xf32> + 
%collapsed_2208 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2800 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2798, %collapsed_2208 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2799 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2801 = tensor.empty() : tensor<1x32x80x1xf32> + %2802 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2801 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2803 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2800 : tensor<1x32x80x80xf32>) outs(%2801 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2804 = tensor.empty() : tensor<1x32x80x80xf32> + %2805 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2800, %2803 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2804 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %2806 = tensor.empty() : tensor<1x32x80x1xf32> + %2807 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2806 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2808 = linalg.generic {indexing_maps = [#map7, 
#map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2805 : tensor<1x32x80x80xf32>) outs(%2807 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2809 = tensor.empty() : tensor<1x32x80x80xf32> + %2810 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2805, %2808 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2809 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2209 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2210 = tensor.collapse_shape %2810 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2211 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2212 = tensor.collapse_shape %2759 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2213 = arith.constant 0.000000e+00 : f32 + %2811 = tensor.empty() : tensor<32x80x128xf32> + %2812 = linalg.fill ins(%cst_2213 : f32) outs(%2811 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2813 = linalg.batch_matmul ins(%collapsed_2210, %collapsed_2212 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2812 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2214 = tensor.expand_shape %2813 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2814 = tensor.empty() : tensor<1x80x32x128xf32> + %2815 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2214 : tensor<1x32x80x128xf32>) outs(%2814 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2215 = tensor.collapse_shape %2815 [[0], [1], [2, 3]] : 
tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2816 = tensor.empty() : tensor<4096x4096xf32> + %2817 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_408 : tensor<4096x4096xf32>) outs(%2816 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2216 = tensor.collapse_shape %collapsed_2215 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2217 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2818 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2216, %2817 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2217 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2218 = tensor.expand_shape %2818 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2819 = tensor.empty() : tensor<1x80x4096xf32> + %2820 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2733, %expanded_2218 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2819 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2821 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2219 = arith.constant 2.000000e+00 : f32 + %2822 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820 : tensor<1x80x4096xf32>) outs(%2821 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2219 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2220 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2823 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2822 : tensor<1x80x4096xf32>) outs(%cst_2220 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = 
arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2221 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2824 = tensor.empty() : tensor<1x80x1xf32> + %2825 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2823, %cst_2221 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2824 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2826 = tensor.empty() : tensor<1x80x1xf32> + %2827 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2825 : tensor<1x80x1xf32>) outs(%2826 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2828 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2222 = tensor.collapse_shape %2827 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2829 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820, %collapsed_2222 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2828 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2223 = tensor.expand_shape %extracted_slice_48 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2830 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2224 = tensor.collapse_shape %expanded_2223 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2831 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2224, %2829 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2830 : 
tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2832 = tensor.empty() : tensor<4096x11008xf32> + %2833 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_410 : tensor<11008x4096xf32>) outs(%2832 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2225 = tensor.collapse_shape %2831 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2226 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2834 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2225, %2833 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2226 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2227 = tensor.expand_shape %2834 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2835 = tensor.empty() : tensor<1x80x11008xf32> + %2836 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2227 : tensor<1x80x11008xf32>) outs(%2835 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2837 = tensor.empty() : tensor<4096x11008xf32> + %2838 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_412 : tensor<11008x4096xf32>) outs(%2837 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2228 = tensor.collapse_shape %2831 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2229 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2839 = 
linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2228, %2838 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2229 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2230 = tensor.expand_shape %2839 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2840 = tensor.empty() : tensor<1x80x11008xf32> + %2841 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2836, %expanded_2230 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2840 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2842 = tensor.empty() : tensor<11008x4096xf32> + %2843 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_414 : tensor<4096x11008xf32>) outs(%2842 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2231 = tensor.collapse_shape %2841 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2232 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2844 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2231, %2843 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2232 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2233 = tensor.expand_shape %2844 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2845 = tensor.empty() : tensor<1x80x4096xf32> + %2846 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2820, %expanded_2233 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2845 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2847 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2234 = 
arith.constant 2.000000e+00 : f32 + %2848 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846 : tensor<1x80x4096xf32>) outs(%2847 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2234 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2235 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2849 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2848 : tensor<1x80x4096xf32>) outs(%cst_2235 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2236 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2850 = tensor.empty() : tensor<1x80x1xf32> + %2851 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2849, %cst_2236 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2850 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2852 = tensor.empty() : tensor<1x80x1xf32> + %2853 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2851 : tensor<1x80x1xf32>) outs(%2852 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2854 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2237 = tensor.collapse_shape %2853 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2855 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846, %collapsed_2237 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2854 : 
tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2238 = tensor.expand_shape %extracted_slice_49 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2856 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2239 = tensor.collapse_shape %expanded_2238 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2857 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2239, %2855 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2856 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2858 = tensor.empty() : tensor<4096x4096xf32> + %2859 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_416 : tensor<4096x4096xf32>) outs(%2858 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2240 = tensor.collapse_shape %2857 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2241 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2860 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2240, %2859 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2241 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2242 = tensor.expand_shape %2860 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2861 = tensor.empty() : tensor<4096x4096xf32> + %2862 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_418 : tensor<4096x4096xf32>) outs(%2861 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2243 = tensor.collapse_shape %2857 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2244 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2863 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2243, %2862 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2244 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2245 = tensor.expand_shape %2863 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2864 = tensor.empty() : tensor<4096x4096xf32> + %2865 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_420 : tensor<4096x4096xf32>) outs(%2864 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2246 = tensor.collapse_shape %2857 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2247 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2866 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2246, %2865 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2247 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2248 = tensor.expand_shape %2866 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2249 = tensor.expand_shape %expanded_2242 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2867 = tensor.empty() : tensor<1x32x80x128xf32> + %2868 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2249 : tensor<1x80x32x128xf32>) outs(%2867 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2250 = tensor.expand_shape %expanded_2245 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2869 = tensor.empty() : tensor<1x32x80x128xf32> + %2870 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2250 : 
tensor<1x80x32x128xf32>) outs(%2869 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2251 = tensor.expand_shape %expanded_2248 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2871 = tensor.empty() : tensor<1x32x80x128xf32> + %2872 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2251 : tensor<1x80x32x128xf32>) outs(%2871 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2252 = tensor.extract_slice %expanded_616[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2253 = tensor.extract_slice %expanded_618[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2873 = tensor.empty() : tensor<1x80x128xf32> + %2874 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2252 : tensor<1x1x80x128xf32>) outs(%2873 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2875 = tensor.empty() : tensor<80x128xf32> + %2876 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2874 : tensor<1x80x128xf32>) outs(%2875 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2877 = tensor.empty() : tensor<1x80x128xf32> + %2878 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2253 : tensor<1x1x80x128xf32>) outs(%2877 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2879 = tensor.empty() : tensor<80x128xf32> + %2880 = linalg.generic {indexing_maps = [#map12, #map], 
iterator_types = ["parallel", "parallel"]} ins(%2878 : tensor<1x80x128xf32>) outs(%2879 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2881 = tensor.empty() : tensor<1x80x128xf32> + %2882 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2881 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2876[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2254 = tensor.expand_shape %2882 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2883 = tensor.empty() : tensor<1x80x128xf32> + %2884 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2883 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2880[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2255 = tensor.expand_shape %2884 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2885 = tensor.empty() : tensor<1x32x80x128xf32> + %2886 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2868, %2882 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2885 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2256 = tensor.extract_slice %2868[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2257 = tensor.extract_slice %2868[0, 0, 
0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2887 = tensor.empty() : tensor<1x32x80x64xf32> + %2888 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2257 : tensor<1x32x80x64xf32>) outs(%2887 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2889 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2258 = tensor.insert_slice %2888 into %2889[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2259 = tensor.insert_slice %extracted_slice_2256 into %inserted_slice_2258[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2890 = tensor.empty() : tensor<1x32x80x128xf32> + %2891 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2259, %2884 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2890 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2892 = tensor.empty() : tensor<1x32x80x128xf32> + %2893 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2886, %2891 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2892 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2894 = tensor.empty() : tensor<1x32x80x128xf32> + %2895 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2870, %2882 : tensor<1x32x80x128xf32>, 
tensor<1x80x128xf32>) outs(%2894 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2260 = tensor.extract_slice %2870[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2261 = tensor.extract_slice %2870[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %2896 = tensor.empty() : tensor<1x32x80x64xf32> + %2897 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2261 : tensor<1x32x80x64xf32>) outs(%2896 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %2898 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2262 = tensor.insert_slice %2897 into %2898[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2263 = tensor.insert_slice %extracted_slice_2260 into %inserted_slice_2262[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %2899 = tensor.empty() : tensor<1x32x80x128xf32> + %2900 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2263, %2884 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%2899 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2901 = tensor.empty() : tensor<1x32x80x128xf32> + %2902 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2895, %2900 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%2901 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %2903 = tensor.empty() : tensor<1x32x128x80xf32> + %2904 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2902 : tensor<1x32x80x128xf32>) outs(%2903 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2264 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2265 = tensor.collapse_shape %2893 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2266 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2267 = tensor.collapse_shape %2904 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2268 = arith.constant 0.000000e+00 : f32 + %2905 = tensor.empty() : tensor<32x80x80xf32> + %2906 = linalg.fill ins(%cst_2268 : f32) outs(%2905 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %2907 = linalg.batch_matmul ins(%collapsed_2265, %collapsed_2267 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%2906 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2269 = tensor.expand_shape %2907 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2270 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %2908 = tensor.empty() : tensor<1x32x80x80xf32> + %2909 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2270 : tensor<1x32x80x80xf32>) outs(%2908 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2910 = tensor.empty() : tensor<1x32x80x80xf32> + %2911 = linalg.generic {indexing_maps = [#map7, #map7, 
#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2269, %2909 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%2910 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2912 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2271 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %2913 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2911, %collapsed_2271 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%2912 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %2914 = tensor.empty() : tensor<1x32x80x1xf32> + %2915 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2914 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2916 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2913 : tensor<1x32x80x80xf32>) outs(%2914 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2917 = tensor.empty() : tensor<1x32x80x80xf32> + %2918 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2913, %2916 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2917 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> 
tensor<1x32x80x80xf32> + %2919 = tensor.empty() : tensor<1x32x80x1xf32> + %2920 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%2919 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %2921 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%2918 : tensor<1x32x80x80xf32>) outs(%2920 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %2922 = tensor.empty() : tensor<1x32x80x80xf32> + %2923 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2918, %2921 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%2922 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2272 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2273 = tensor.collapse_shape %2923 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2274 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2275 = tensor.collapse_shape %2872 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2276 = arith.constant 0.000000e+00 : f32 + %2924 = tensor.empty() : tensor<32x80x128xf32> + %2925 = linalg.fill ins(%cst_2276 : f32) outs(%2924 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %2926 = linalg.batch_matmul ins(%collapsed_2273, %collapsed_2275 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%2925 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2277 = tensor.expand_shape %2926 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %2927 = 
tensor.empty() : tensor<1x80x32x128xf32> + %2928 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2277 : tensor<1x32x80x128xf32>) outs(%2927 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2278 = tensor.collapse_shape %2928 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %2929 = tensor.empty() : tensor<4096x4096xf32> + %2930 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_422 : tensor<4096x4096xf32>) outs(%2929 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2279 = tensor.collapse_shape %collapsed_2278 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2280 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2931 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2279, %2930 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2280 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2281 = tensor.expand_shape %2931 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2932 = tensor.empty() : tensor<1x80x4096xf32> + %2933 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2846, %expanded_2281 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2932 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2934 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2282 = arith.constant 2.000000e+00 : f32 + %2935 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933 : tensor<1x80x4096xf32>) outs(%2934 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = 
math.powf %in, %cst_2282 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2283 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2936 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2935 : tensor<1x80x4096xf32>) outs(%cst_2283 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2284 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2937 = tensor.empty() : tensor<1x80x1xf32> + %2938 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2936, %cst_2284 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2937 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2939 = tensor.empty() : tensor<1x80x1xf32> + %2940 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2938 : tensor<1x80x1xf32>) outs(%2939 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2941 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2285 = tensor.collapse_shape %2940 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2942 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933, %collapsed_2285 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2941 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2286 = tensor.expand_shape %extracted_slice_50 [[0, 1, 2]] : tensor<4096xf32> into 
tensor<1x1x4096xf32> + %2943 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2287 = tensor.collapse_shape %expanded_2286 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2944 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2287, %2942 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2943 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2945 = tensor.empty() : tensor<4096x11008xf32> + %2946 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_424 : tensor<11008x4096xf32>) outs(%2945 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2288 = tensor.collapse_shape %2944 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2289 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2947 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2288, %2946 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2289 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2290 = tensor.expand_shape %2947 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2948 = tensor.empty() : tensor<1x80x11008xf32> + %2949 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2290 : tensor<1x80x11008xf32>) outs(%2948 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %2950 = tensor.empty() : tensor<4096x11008xf32> + %2951 = linalg.generic {indexing_maps = [#map, #map9], 
iterator_types = ["parallel", "parallel"]} ins(%expanded_426 : tensor<11008x4096xf32>) outs(%2950 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2291 = tensor.collapse_shape %2944 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2292 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %2952 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2291, %2951 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2292 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2293 = tensor.expand_shape %2952 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %2953 = tensor.empty() : tensor<1x80x11008xf32> + %2954 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2949, %expanded_2293 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%2953 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %2955 = tensor.empty() : tensor<11008x4096xf32> + %2956 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_428 : tensor<4096x11008xf32>) outs(%2955 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2294 = tensor.collapse_shape %2954 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2295 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2957 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2294, %2956 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2295 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2296 = tensor.expand_shape %2957 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2958 = tensor.empty() : tensor<1x80x4096xf32> + %2959 = linalg.generic {indexing_maps 
= [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2933, %expanded_2296 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%2958 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2960 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2297 = arith.constant 2.000000e+00 : f32 + %2961 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959 : tensor<1x80x4096xf32>) outs(%2960 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2297 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2298 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %2962 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%2961 : tensor<1x80x4096xf32>) outs(%cst_2298 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2299 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %2963 = tensor.empty() : tensor<1x80x1xf32> + %2964 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2962, %cst_2299 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%2963 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %2965 = tensor.empty() : tensor<1x80x1xf32> + %2966 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2964 : tensor<1x80x1xf32>) outs(%2965 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield 
%3652 : f32 + } -> tensor<1x80x1xf32> + %2967 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2300 = tensor.collapse_shape %2966 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %2968 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959, %collapsed_2300 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%2967 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2301 = tensor.expand_shape %extracted_slice_51 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %2969 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2302 = tensor.collapse_shape %expanded_2301 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %2970 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2302, %2968 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%2969 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %2971 = tensor.empty() : tensor<4096x4096xf32> + %2972 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_430 : tensor<4096x4096xf32>) outs(%2971 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2303 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2304 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2973 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2303, %2972 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2304 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2305 = tensor.expand_shape %2973 [[0, 1], [2]] : tensor<80x4096xf32> into 
tensor<1x80x4096xf32> + %2974 = tensor.empty() : tensor<4096x4096xf32> + %2975 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_432 : tensor<4096x4096xf32>) outs(%2974 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2306 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2307 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2976 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2306, %2975 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2307 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2308 = tensor.expand_shape %2976 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %2977 = tensor.empty() : tensor<4096x4096xf32> + %2978 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_434 : tensor<4096x4096xf32>) outs(%2977 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2309 = tensor.collapse_shape %2970 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2310 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %2979 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2309, %2978 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2310 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2311 = tensor.expand_shape %2979 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2312 = tensor.expand_shape %expanded_2305 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2980 = tensor.empty() : tensor<1x32x80x128xf32> + %2981 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2312 : tensor<1x80x32x128xf32>) outs(%2980 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, 
%out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2313 = tensor.expand_shape %expanded_2308 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2982 = tensor.empty() : tensor<1x32x80x128xf32> + %2983 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2313 : tensor<1x80x32x128xf32>) outs(%2982 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2314 = tensor.expand_shape %expanded_2311 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %2984 = tensor.empty() : tensor<1x32x80x128xf32> + %2985 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2314 : tensor<1x80x32x128xf32>) outs(%2984 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2315 = tensor.extract_slice %expanded_620[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2316 = tensor.extract_slice %expanded_622[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %2986 = tensor.empty() : tensor<1x80x128xf32> + %2987 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2315 : tensor<1x1x80x128xf32>) outs(%2986 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2988 = tensor.empty() : tensor<80x128xf32> + %2989 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2987 : tensor<1x80x128xf32>) outs(%2988 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2990 = tensor.empty() : 
tensor<1x80x128xf32> + %2991 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2316 : tensor<1x1x80x128xf32>) outs(%2990 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %2992 = tensor.empty() : tensor<80x128xf32> + %2993 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%2991 : tensor<1x80x128xf32>) outs(%2992 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %2994 = tensor.empty() : tensor<1x80x128xf32> + %2995 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2994 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2989[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2317 = tensor.expand_shape %2995 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2996 = tensor.empty() : tensor<1x80x128xf32> + %2997 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%2996 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %2993[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2318 = tensor.expand_shape %2997 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %2998 = tensor.empty() : tensor<1x32x80x128xf32> + %2999 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2981, %2995 : tensor<1x32x80x128xf32>, 
tensor<1x80x128xf32>) outs(%2998 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2319 = tensor.extract_slice %2981[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2320 = tensor.extract_slice %2981[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3000 = tensor.empty() : tensor<1x32x80x64xf32> + %3001 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2320 : tensor<1x32x80x64xf32>) outs(%3000 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3002 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2321 = tensor.insert_slice %3001 into %3002[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2322 = tensor.insert_slice %extracted_slice_2319 into %inserted_slice_2321[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3003 = tensor.empty() : tensor<1x32x80x128xf32> + %3004 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2322, %2997 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3003 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3005 = tensor.empty() : tensor<1x32x80x128xf32> + %3006 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2999, %3004 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3005 : 
tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3007 = tensor.empty() : tensor<1x32x80x128xf32> + %3008 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2983, %2995 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3007 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2323 = tensor.extract_slice %2983[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2324 = tensor.extract_slice %2983[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3009 = tensor.empty() : tensor<1x32x80x64xf32> + %3010 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2324 : tensor<1x32x80x64xf32>) outs(%3009 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3011 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2325 = tensor.insert_slice %3010 into %3011[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2326 = tensor.insert_slice %extracted_slice_2323 into %inserted_slice_2325[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3012 = tensor.empty() : tensor<1x32x80x128xf32> + %3013 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2326, %2997 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3012 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, 
%in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3014 = tensor.empty() : tensor<1x32x80x128xf32> + %3015 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3008, %3013 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3014 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3016 = tensor.empty() : tensor<1x32x128x80xf32> + %3017 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3015 : tensor<1x32x80x128xf32>) outs(%3016 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2327 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2328 = tensor.collapse_shape %3006 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2329 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2330 = tensor.collapse_shape %3017 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2331 = arith.constant 0.000000e+00 : f32 + %3018 = tensor.empty() : tensor<32x80x80xf32> + %3019 = linalg.fill ins(%cst_2331 : f32) outs(%3018 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3020 = linalg.batch_matmul ins(%collapsed_2328, %collapsed_2330 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3019 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2332 = tensor.expand_shape %3020 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2333 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3021 = tensor.empty() : tensor<1x32x80x80xf32> + %3022 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%cst_2333 : tensor<1x32x80x80xf32>) outs(%3021 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3023 = tensor.empty() : tensor<1x32x80x80xf32> + %3024 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2332, %3022 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3023 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3025 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2334 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3026 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3024, %collapsed_2334 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3025 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3027 = tensor.empty() : tensor<1x32x80x1xf32> + %3028 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3027 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3029 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3026 : tensor<1x32x80x80xf32>) outs(%3027 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3030 = tensor.empty() : tensor<1x32x80x80xf32> + %3031 = 
linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3026, %3029 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3030 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3032 = tensor.empty() : tensor<1x32x80x1xf32> + %3033 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3032 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3034 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3031 : tensor<1x32x80x80xf32>) outs(%3033 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3035 = tensor.empty() : tensor<1x32x80x80xf32> + %3036 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3031, %3034 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3035 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2335 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2336 = tensor.collapse_shape %3036 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2337 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2338 = tensor.collapse_shape %2985 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2339 = arith.constant 0.000000e+00 : f32 + %3037 = tensor.empty() : tensor<32x80x128xf32> + %3038 = linalg.fill 
ins(%cst_2339 : f32) outs(%3037 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3039 = linalg.batch_matmul ins(%collapsed_2336, %collapsed_2338 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3038 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2340 = tensor.expand_shape %3039 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3040 = tensor.empty() : tensor<1x80x32x128xf32> + %3041 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2340 : tensor<1x32x80x128xf32>) outs(%3040 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2341 = tensor.collapse_shape %3041 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3042 = tensor.empty() : tensor<4096x4096xf32> + %3043 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_436 : tensor<4096x4096xf32>) outs(%3042 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2342 = tensor.collapse_shape %collapsed_2341 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2343 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3044 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2342, %3043 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2343 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2344 = tensor.expand_shape %3044 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3045 = tensor.empty() : tensor<1x80x4096xf32> + %3046 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2959, %expanded_2344 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3045 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : 
f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3047 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2345 = arith.constant 2.000000e+00 : f32 + %3048 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3046 : tensor<1x80x4096xf32>) outs(%3047 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2345 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2346 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3049 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3048 : tensor<1x80x4096xf32>) outs(%cst_2346 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2347 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3050 = tensor.empty() : tensor<1x80x1xf32> + %3051 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3049, %cst_2347 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3050 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3052 = tensor.empty() : tensor<1x80x1xf32> + %3053 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3051 : tensor<1x80x1xf32>) outs(%3052 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3054 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2348 = tensor.collapse_shape %3053 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3055 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = 
["parallel", "parallel", "parallel"]} ins(%3046, %collapsed_2348 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3054 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2349 = tensor.expand_shape %extracted_slice_52 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3056 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2350 = tensor.collapse_shape %expanded_2349 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3057 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2350, %3055 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3056 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3058 = tensor.empty() : tensor<4096x11008xf32> + %3059 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_438 : tensor<11008x4096xf32>) outs(%3058 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2351 = tensor.collapse_shape %3057 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2352 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3060 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2351, %3059 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2352 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2353 = tensor.expand_shape %3060 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3061 = tensor.empty() : tensor<1x80x11008xf32> + %3062 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2353 : tensor<1x80x11008xf32>) outs(%3061 : tensor<1x80x11008xf32>) { + ^bb0(%in: 
f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3063 = tensor.empty() : tensor<4096x11008xf32> + %3064 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_440 : tensor<11008x4096xf32>) outs(%3063 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2354 = tensor.collapse_shape %3057 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2355 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3065 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2354, %3064 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2355 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2356 = tensor.expand_shape %3065 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3066 = tensor.empty() : tensor<1x80x11008xf32> + %3067 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3062, %expanded_2356 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3066 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3068 = tensor.empty() : tensor<11008x4096xf32> + %3069 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_442 : tensor<4096x11008xf32>) outs(%3068 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2357 = tensor.collapse_shape %3067 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2358 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3070 = 
linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2357, %3069 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2358 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2359 = tensor.expand_shape %3070 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3071 = tensor.empty() : tensor<1x80x4096xf32> + %3072 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3046, %expanded_2359 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3071 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3073 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2360 = arith.constant 2.000000e+00 : f32 + %3074 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072 : tensor<1x80x4096xf32>) outs(%3073 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2360 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2361 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3075 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3074 : tensor<1x80x4096xf32>) outs(%cst_2361 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2362 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3076 = tensor.empty() : tensor<1x80x1xf32> + %3077 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3075, %cst_2362 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3076 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, 
%in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3078 = tensor.empty() : tensor<1x80x1xf32> + %3079 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3077 : tensor<1x80x1xf32>) outs(%3078 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3080 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2363 = tensor.collapse_shape %3079 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3081 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072, %collapsed_2363 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3080 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2364 = tensor.expand_shape %extracted_slice_53 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3082 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2365 = tensor.collapse_shape %expanded_2364 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3083 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2365, %3081 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3082 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3084 = tensor.empty() : tensor<4096x4096xf32> + %3085 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_444 : tensor<4096x4096xf32>) outs(%3084 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2366 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into 
tensor<80x4096xf32> + %cst_2367 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3086 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2366, %3085 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2367 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2368 = tensor.expand_shape %3086 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3087 = tensor.empty() : tensor<4096x4096xf32> + %3088 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_446 : tensor<4096x4096xf32>) outs(%3087 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2369 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2370 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3089 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2369, %3088 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2370 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2371 = tensor.expand_shape %3089 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3090 = tensor.empty() : tensor<4096x4096xf32> + %3091 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_448 : tensor<4096x4096xf32>) outs(%3090 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2372 = tensor.collapse_shape %3083 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2373 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3092 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2372, %3091 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2373 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2374 = tensor.expand_shape %3092 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2375 = tensor.expand_shape 
%expanded_2368 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3093 = tensor.empty() : tensor<1x32x80x128xf32> + %3094 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2375 : tensor<1x80x32x128xf32>) outs(%3093 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2376 = tensor.expand_shape %expanded_2371 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3095 = tensor.empty() : tensor<1x32x80x128xf32> + %3096 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2376 : tensor<1x80x32x128xf32>) outs(%3095 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2377 = tensor.expand_shape %expanded_2374 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3097 = tensor.empty() : tensor<1x32x80x128xf32> + %3098 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2377 : tensor<1x80x32x128xf32>) outs(%3097 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2378 = tensor.extract_slice %expanded_624[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2379 = tensor.extract_slice %expanded_626[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %3099 = tensor.empty() : tensor<1x80x128xf32> + %3100 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2378 : tensor<1x1x80x128xf32>) outs(%3099 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + 
} -> tensor<1x80x128xf32> + %3101 = tensor.empty() : tensor<80x128xf32> + %3102 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3100 : tensor<1x80x128xf32>) outs(%3101 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3103 = tensor.empty() : tensor<1x80x128xf32> + %3104 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2379 : tensor<1x1x80x128xf32>) outs(%3103 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3105 = tensor.empty() : tensor<80x128xf32> + %3106 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3104 : tensor<1x80x128xf32>) outs(%3105 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3107 = tensor.empty() : tensor<1x80x128xf32> + %3108 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3107 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3102[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2380 = tensor.expand_shape %3108 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3109 = tensor.empty() : tensor<1x80x128xf32> + %3110 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3109 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3106[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> 
tensor<1x80x128xf32> + %expanded_2381 = tensor.expand_shape %3110 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3111 = tensor.empty() : tensor<1x32x80x128xf32> + %3112 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3094, %3108 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3111 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2382 = tensor.extract_slice %3094[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2383 = tensor.extract_slice %3094[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3113 = tensor.empty() : tensor<1x32x80x64xf32> + %3114 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2383 : tensor<1x32x80x64xf32>) outs(%3113 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3115 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2384 = tensor.insert_slice %3114 into %3115[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2385 = tensor.insert_slice %extracted_slice_2382 into %inserted_slice_2384[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3116 = tensor.empty() : tensor<1x32x80x128xf32> + %3117 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2385, %3110 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3116 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 
= arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3118 = tensor.empty() : tensor<1x32x80x128xf32> + %3119 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3112, %3117 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3118 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3120 = tensor.empty() : tensor<1x32x80x128xf32> + %3121 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3096, %3108 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3120 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2386 = tensor.extract_slice %3096[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2387 = tensor.extract_slice %3096[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3122 = tensor.empty() : tensor<1x32x80x64xf32> + %3123 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2387 : tensor<1x32x80x64xf32>) outs(%3122 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3124 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2388 = tensor.insert_slice %3123 into %3124[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2389 = tensor.insert_slice %extracted_slice_2386 into %inserted_slice_2388[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : 
tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3125 = tensor.empty() : tensor<1x32x80x128xf32> + %3126 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2389, %3110 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3125 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3127 = tensor.empty() : tensor<1x32x80x128xf32> + %3128 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3121, %3126 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3127 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3129 = tensor.empty() : tensor<1x32x128x80xf32> + %3130 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3128 : tensor<1x32x80x128xf32>) outs(%3129 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2390 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2391 = tensor.collapse_shape %3119 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2392 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2393 = tensor.collapse_shape %3130 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2394 = arith.constant 0.000000e+00 : f32 + %3131 = tensor.empty() : tensor<32x80x80xf32> + %3132 = linalg.fill ins(%cst_2394 : f32) outs(%3131 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3133 = linalg.batch_matmul ins(%collapsed_2391, %collapsed_2393 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3132 : 
tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2395 = tensor.expand_shape %3133 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2396 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3134 = tensor.empty() : tensor<1x32x80x80xf32> + %3135 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2396 : tensor<1x32x80x80xf32>) outs(%3134 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3136 = tensor.empty() : tensor<1x32x80x80xf32> + %3137 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2395, %3135 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3136 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3138 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2397 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3139 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3137, %collapsed_2397 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3138 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3140 = tensor.empty() : tensor<1x32x80x1xf32> + %3141 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3140 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3142 = 
linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3139 : tensor<1x32x80x80xf32>) outs(%3140 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3143 = tensor.empty() : tensor<1x32x80x80xf32> + %3144 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3139, %3142 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3143 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3145 = tensor.empty() : tensor<1x32x80x1xf32> + %3146 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3145 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3147 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3144 : tensor<1x32x80x80xf32>) outs(%3146 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3148 = tensor.empty() : tensor<1x32x80x80xf32> + %3149 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3144, %3147 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3148 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2398 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2399 = tensor.collapse_shape %3149 [[0, 1], [2], 
[3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2400 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2401 = tensor.collapse_shape %3098 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2402 = arith.constant 0.000000e+00 : f32 + %3150 = tensor.empty() : tensor<32x80x128xf32> + %3151 = linalg.fill ins(%cst_2402 : f32) outs(%3150 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3152 = linalg.batch_matmul ins(%collapsed_2399, %collapsed_2401 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3151 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2403 = tensor.expand_shape %3152 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3153 = tensor.empty() : tensor<1x80x32x128xf32> + %3154 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2403 : tensor<1x32x80x128xf32>) outs(%3153 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2404 = tensor.collapse_shape %3154 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3155 = tensor.empty() : tensor<4096x4096xf32> + %3156 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_450 : tensor<4096x4096xf32>) outs(%3155 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2405 = tensor.collapse_shape %collapsed_2404 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2406 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3157 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2405, %3156 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2406 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2407 = tensor.expand_shape %3157 [[0, 1], [2]] : tensor<80x4096xf32> into 
tensor<1x80x4096xf32> + %3158 = tensor.empty() : tensor<1x80x4096xf32> + %3159 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3072, %expanded_2407 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3158 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3160 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2408 = arith.constant 2.000000e+00 : f32 + %3161 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159 : tensor<1x80x4096xf32>) outs(%3160 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2408 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2409 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3162 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3161 : tensor<1x80x4096xf32>) outs(%cst_2409 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2410 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3163 = tensor.empty() : tensor<1x80x1xf32> + %3164 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3162, %cst_2410 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3163 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3165 = tensor.empty() : tensor<1x80x1xf32> + %3166 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3164 : tensor<1x80x1xf32>) 
outs(%3165 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3167 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2411 = tensor.collapse_shape %3166 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3168 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159, %collapsed_2411 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3167 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2412 = tensor.expand_shape %extracted_slice_54 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3169 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2413 = tensor.collapse_shape %expanded_2412 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3170 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2413, %3168 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3169 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3171 = tensor.empty() : tensor<4096x11008xf32> + %3172 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_452 : tensor<11008x4096xf32>) outs(%3171 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2414 = tensor.collapse_shape %3170 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2415 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3173 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2414, %3172 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2415 : tensor<80x11008xf32>) -> 
tensor<80x11008xf32> + %expanded_2416 = tensor.expand_shape %3173 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3174 = tensor.empty() : tensor<1x80x11008xf32> + %3175 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2416 : tensor<1x80x11008xf32>) outs(%3174 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3176 = tensor.empty() : tensor<4096x11008xf32> + %3177 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_454 : tensor<11008x4096xf32>) outs(%3176 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2417 = tensor.collapse_shape %3170 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2418 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3178 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2417, %3177 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2418 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2419 = tensor.expand_shape %3178 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3179 = tensor.empty() : tensor<1x80x11008xf32> + %3180 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3175, %expanded_2419 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3179 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3181 = tensor.empty() : tensor<11008x4096xf32> + %3182 = linalg.generic {indexing_maps = [#map, #map9], 
iterator_types = ["parallel", "parallel"]} ins(%expanded_456 : tensor<4096x11008xf32>) outs(%3181 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2420 = tensor.collapse_shape %3180 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2421 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3183 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2420, %3182 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2421 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2422 = tensor.expand_shape %3183 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3184 = tensor.empty() : tensor<1x80x4096xf32> + %3185 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3159, %expanded_2422 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3184 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3186 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2423 = arith.constant 2.000000e+00 : f32 + %3187 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185 : tensor<1x80x4096xf32>) outs(%3186 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2423 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2424 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3188 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3187 : tensor<1x80x4096xf32>) outs(%cst_2424 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> 
+ %cst_2425 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3189 = tensor.empty() : tensor<1x80x1xf32> + %3190 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3188, %cst_2425 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3189 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3191 = tensor.empty() : tensor<1x80x1xf32> + %3192 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3190 : tensor<1x80x1xf32>) outs(%3191 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3193 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2426 = tensor.collapse_shape %3192 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3194 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185, %collapsed_2426 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3193 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2427 = tensor.expand_shape %extracted_slice_55 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3195 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2428 = tensor.collapse_shape %expanded_2427 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3196 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2428, %3194 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3195 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3197 = 
tensor.empty() : tensor<4096x4096xf32> + %3198 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_458 : tensor<4096x4096xf32>) outs(%3197 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2429 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2430 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3199 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2429, %3198 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2430 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2431 = tensor.expand_shape %3199 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3200 = tensor.empty() : tensor<4096x4096xf32> + %3201 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_460 : tensor<4096x4096xf32>) outs(%3200 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2432 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2433 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3202 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2432, %3201 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2433 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2434 = tensor.expand_shape %3202 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3203 = tensor.empty() : tensor<4096x4096xf32> + %3204 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_462 : tensor<4096x4096xf32>) outs(%3203 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2435 = tensor.collapse_shape %3196 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2436 = 
arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3205 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2435, %3204 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2436 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2437 = tensor.expand_shape %3205 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2438 = tensor.expand_shape %expanded_2431 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3206 = tensor.empty() : tensor<1x32x80x128xf32> + %3207 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2438 : tensor<1x80x32x128xf32>) outs(%3206 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2439 = tensor.expand_shape %expanded_2434 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3208 = tensor.empty() : tensor<1x32x80x128xf32> + %3209 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2439 : tensor<1x80x32x128xf32>) outs(%3208 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2440 = tensor.expand_shape %expanded_2437 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3210 = tensor.empty() : tensor<1x32x80x128xf32> + %3211 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2440 : tensor<1x80x32x128xf32>) outs(%3210 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2441 = tensor.extract_slice %expanded_628[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2442 = tensor.extract_slice %expanded_630[0, 0, 0, 0] 
[1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %3212 = tensor.empty() : tensor<1x80x128xf32> + %3213 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2441 : tensor<1x1x80x128xf32>) outs(%3212 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3214 = tensor.empty() : tensor<80x128xf32> + %3215 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3213 : tensor<1x80x128xf32>) outs(%3214 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3216 = tensor.empty() : tensor<1x80x128xf32> + %3217 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2442 : tensor<1x1x80x128xf32>) outs(%3216 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3218 = tensor.empty() : tensor<80x128xf32> + %3219 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3217 : tensor<1x80x128xf32>) outs(%3218 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3220 = tensor.empty() : tensor<1x80x128xf32> + %3221 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3220 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3215[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2443 = tensor.expand_shape %3221 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3222 = tensor.empty() : tensor<1x80x128xf32> + %3223 = linalg.generic 
{indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3222 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3219[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2444 = tensor.expand_shape %3223 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3224 = tensor.empty() : tensor<1x32x80x128xf32> + %3225 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3207, %3221 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3224 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2445 = tensor.extract_slice %3207[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2446 = tensor.extract_slice %3207[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3226 = tensor.empty() : tensor<1x32x80x64xf32> + %3227 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2446 : tensor<1x32x80x64xf32>) outs(%3226 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3228 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2447 = tensor.insert_slice %3227 into %3228[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2448 = tensor.insert_slice %extracted_slice_2445 into %inserted_slice_2447[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into 
tensor<1x32x80x128xf32> + %3229 = tensor.empty() : tensor<1x32x80x128xf32> + %3230 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2448, %3223 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3229 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3231 = tensor.empty() : tensor<1x32x80x128xf32> + %3232 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3225, %3230 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3231 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3233 = tensor.empty() : tensor<1x32x80x128xf32> + %3234 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3209, %3221 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3233 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2449 = tensor.extract_slice %3209[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2450 = tensor.extract_slice %3209[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3235 = tensor.empty() : tensor<1x32x80x64xf32> + %3236 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2450 : tensor<1x32x80x64xf32>) outs(%3235 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> 
tensor<1x32x80x64xf32> + %3237 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2451 = tensor.insert_slice %3236 into %3237[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2452 = tensor.insert_slice %extracted_slice_2449 into %inserted_slice_2451[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3238 = tensor.empty() : tensor<1x32x80x128xf32> + %3239 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2452, %3223 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3238 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3240 = tensor.empty() : tensor<1x32x80x128xf32> + %3241 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3234, %3239 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3240 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3242 = tensor.empty() : tensor<1x32x128x80xf32> + %3243 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3241 : tensor<1x32x80x128xf32>) outs(%3242 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2453 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2454 = tensor.collapse_shape %3232 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2455 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2456 = tensor.collapse_shape %3243 [[0, 1], [2], [3]] : 
tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2457 = arith.constant 0.000000e+00 : f32 + %3244 = tensor.empty() : tensor<32x80x80xf32> + %3245 = linalg.fill ins(%cst_2457 : f32) outs(%3244 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3246 = linalg.batch_matmul ins(%collapsed_2454, %collapsed_2456 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3245 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2458 = tensor.expand_shape %3246 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2459 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3247 = tensor.empty() : tensor<1x32x80x80xf32> + %3248 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2459 : tensor<1x32x80x80xf32>) outs(%3247 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3249 = tensor.empty() : tensor<1x32x80x80xf32> + %3250 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2458, %3248 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3249 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3251 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2460 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3252 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3250, %collapsed_2460 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3251 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : 
f32 + } -> tensor<1x32x80x80xf32> + %3253 = tensor.empty() : tensor<1x32x80x1xf32> + %3254 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3253 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3255 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3252 : tensor<1x32x80x80xf32>) outs(%3253 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3256 = tensor.empty() : tensor<1x32x80x80xf32> + %3257 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3252, %3255 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3256 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3258 = tensor.empty() : tensor<1x32x80x1xf32> + %3259 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3258 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3260 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3257 : tensor<1x32x80x80xf32>) outs(%3259 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3261 = tensor.empty() : tensor<1x32x80x80xf32> + %3262 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
ins(%3257, %3260 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3261 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2461 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2462 = tensor.collapse_shape %3262 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2463 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2464 = tensor.collapse_shape %3211 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2465 = arith.constant 0.000000e+00 : f32 + %3263 = tensor.empty() : tensor<32x80x128xf32> + %3264 = linalg.fill ins(%cst_2465 : f32) outs(%3263 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3265 = linalg.batch_matmul ins(%collapsed_2462, %collapsed_2464 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3264 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2466 = tensor.expand_shape %3265 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3266 = tensor.empty() : tensor<1x80x32x128xf32> + %3267 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2466 : tensor<1x32x80x128xf32>) outs(%3266 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2467 = tensor.collapse_shape %3267 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3268 = tensor.empty() : tensor<4096x4096xf32> + %3269 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_464 : tensor<4096x4096xf32>) outs(%3268 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2468 = tensor.collapse_shape %collapsed_2467 [[0, 1], [2]] : 
tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2469 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3270 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2468, %3269 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2469 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2470 = tensor.expand_shape %3270 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3271 = tensor.empty() : tensor<1x80x4096xf32> + %3272 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3185, %expanded_2470 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3271 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3273 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2471 = arith.constant 2.000000e+00 : f32 + %3274 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272 : tensor<1x80x4096xf32>) outs(%3273 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2471 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2472 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3275 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3274 : tensor<1x80x4096xf32>) outs(%cst_2472 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2473 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3276 = tensor.empty() : tensor<1x80x1xf32> + %3277 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3275, %cst_2473 : 
tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3276 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3278 = tensor.empty() : tensor<1x80x1xf32> + %3279 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3277 : tensor<1x80x1xf32>) outs(%3278 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3280 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2474 = tensor.collapse_shape %3279 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3281 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272, %collapsed_2474 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3280 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2475 = tensor.expand_shape %extracted_slice_56 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3282 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2476 = tensor.collapse_shape %expanded_2475 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3283 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2476, %3281 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3282 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3284 = tensor.empty() : tensor<4096x11008xf32> + %3285 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_466 : tensor<11008x4096xf32>) outs(%3284 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2477 = tensor.collapse_shape %3283 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2478 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3286 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2477, %3285 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2478 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2479 = tensor.expand_shape %3286 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3287 = tensor.empty() : tensor<1x80x11008xf32> + %3288 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2479 : tensor<1x80x11008xf32>) outs(%3287 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3289 = tensor.empty() : tensor<4096x11008xf32> + %3290 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_468 : tensor<11008x4096xf32>) outs(%3289 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2480 = tensor.collapse_shape %3283 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2481 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3291 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2480, %3290 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2481 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2482 = tensor.expand_shape %3291 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3292 = tensor.empty() : tensor<1x80x11008xf32> + %3293 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", 
"parallel", "parallel"]} ins(%3288, %expanded_2482 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3292 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3294 = tensor.empty() : tensor<11008x4096xf32> + %3295 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_470 : tensor<4096x11008xf32>) outs(%3294 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2483 = tensor.collapse_shape %3293 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2484 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3296 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2483, %3295 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2484 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2485 = tensor.expand_shape %3296 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3297 = tensor.empty() : tensor<1x80x4096xf32> + %3298 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3272, %expanded_2485 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3297 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3299 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2486 = arith.constant 2.000000e+00 : f32 + %3300 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298 : tensor<1x80x4096xf32>) outs(%3299 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2486 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2487 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3301 = 
linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3300 : tensor<1x80x4096xf32>) outs(%cst_2487 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2488 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3302 = tensor.empty() : tensor<1x80x1xf32> + %3303 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3301, %cst_2488 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3302 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3304 = tensor.empty() : tensor<1x80x1xf32> + %3305 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3303 : tensor<1x80x1xf32>) outs(%3304 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3306 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2489 = tensor.collapse_shape %3305 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3307 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298, %collapsed_2489 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3306 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2490 = tensor.expand_shape %extracted_slice_57 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3308 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2491 = tensor.collapse_shape %expanded_2490 [[0, 1], [2]] : tensor<1x1x4096xf32> into 
tensor<1x4096xf32> + %3309 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2491, %3307 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3308 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3310 = tensor.empty() : tensor<4096x4096xf32> + %3311 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_472 : tensor<4096x4096xf32>) outs(%3310 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2492 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2493 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3312 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2492, %3311 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2493 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2494 = tensor.expand_shape %3312 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3313 = tensor.empty() : tensor<4096x4096xf32> + %3314 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_474 : tensor<4096x4096xf32>) outs(%3313 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2495 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2496 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3315 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2495, %3314 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2496 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2497 = tensor.expand_shape %3315 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3316 = tensor.empty() : 
tensor<4096x4096xf32> + %3317 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_476 : tensor<4096x4096xf32>) outs(%3316 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2498 = tensor.collapse_shape %3309 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2499 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3318 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2498, %3317 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2499 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2500 = tensor.expand_shape %3318 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2501 = tensor.expand_shape %expanded_2494 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3319 = tensor.empty() : tensor<1x32x80x128xf32> + %3320 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2501 : tensor<1x80x32x128xf32>) outs(%3319 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2502 = tensor.expand_shape %expanded_2497 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3321 = tensor.empty() : tensor<1x32x80x128xf32> + %3322 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2502 : tensor<1x80x32x128xf32>) outs(%3321 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2503 = tensor.expand_shape %expanded_2500 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3323 = tensor.empty() : tensor<1x32x80x128xf32> + %3324 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%expanded_2503 : tensor<1x80x32x128xf32>) outs(%3323 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2504 = tensor.extract_slice %expanded_632[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2505 = tensor.extract_slice %expanded_634[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %3325 = tensor.empty() : tensor<1x80x128xf32> + %3326 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2504 : tensor<1x1x80x128xf32>) outs(%3325 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3327 = tensor.empty() : tensor<80x128xf32> + %3328 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3326 : tensor<1x80x128xf32>) outs(%3327 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3329 = tensor.empty() : tensor<1x80x128xf32> + %3330 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2505 : tensor<1x1x80x128xf32>) outs(%3329 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3331 = tensor.empty() : tensor<80x128xf32> + %3332 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3330 : tensor<1x80x128xf32>) outs(%3331 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3333 = tensor.empty() : tensor<1x80x128xf32> + %3334 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3333 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, 
%out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3328[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2506 = tensor.expand_shape %3334 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3335 = tensor.empty() : tensor<1x80x128xf32> + %3336 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3335 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3332[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2507 = tensor.expand_shape %3336 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3337 = tensor.empty() : tensor<1x32x80x128xf32> + %3338 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3320, %3334 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3337 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2508 = tensor.extract_slice %3320[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2509 = tensor.extract_slice %3320[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3339 = tensor.empty() : tensor<1x32x80x64xf32> + %3340 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2509 : tensor<1x32x80x64xf32>) outs(%3339 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : 
f32 + } -> tensor<1x32x80x64xf32> + %3341 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2510 = tensor.insert_slice %3340 into %3341[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2511 = tensor.insert_slice %extracted_slice_2508 into %inserted_slice_2510[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3342 = tensor.empty() : tensor<1x32x80x128xf32> + %3343 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2511, %3336 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3342 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3344 = tensor.empty() : tensor<1x32x80x128xf32> + %3345 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3338, %3343 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3344 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3346 = tensor.empty() : tensor<1x32x80x128xf32> + %3347 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3322, %3334 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3346 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2512 = tensor.extract_slice %3322[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2513 = tensor.extract_slice %3322[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : 
tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3348 = tensor.empty() : tensor<1x32x80x64xf32> + %3349 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2513 : tensor<1x32x80x64xf32>) outs(%3348 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3350 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2514 = tensor.insert_slice %3349 into %3350[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2515 = tensor.insert_slice %extracted_slice_2512 into %inserted_slice_2514[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3351 = tensor.empty() : tensor<1x32x80x128xf32> + %3352 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2515, %3336 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3351 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3353 = tensor.empty() : tensor<1x32x80x128xf32> + %3354 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3347, %3352 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3353 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3355 = tensor.empty() : tensor<1x32x128x80xf32> + %3356 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3354 : tensor<1x32x80x128xf32>) outs(%3355 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2516 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2517 = tensor.collapse_shape %3345 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2518 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2519 = tensor.collapse_shape %3356 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2520 = arith.constant 0.000000e+00 : f32 + %3357 = tensor.empty() : tensor<32x80x80xf32> + %3358 = linalg.fill ins(%cst_2520 : f32) outs(%3357 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3359 = linalg.batch_matmul ins(%collapsed_2517, %collapsed_2519 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3358 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2521 = tensor.expand_shape %3359 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2522 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3360 = tensor.empty() : tensor<1x32x80x80xf32> + %3361 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2522 : tensor<1x32x80x80xf32>) outs(%3360 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3362 = tensor.empty() : tensor<1x32x80x80xf32> + %3363 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2521, %3361 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3362 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3364 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2523 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : 
tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3365 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3363, %collapsed_2523 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3364 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3366 = tensor.empty() : tensor<1x32x80x1xf32> + %3367 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3366 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3368 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3365 : tensor<1x32x80x80xf32>) outs(%3366 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3369 = tensor.empty() : tensor<1x32x80x80xf32> + %3370 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3365, %3368 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3369 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3371 = tensor.empty() : tensor<1x32x80x1xf32> + %3372 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3371 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3373 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", 
"reduction"]} ins(%3370 : tensor<1x32x80x80xf32>) outs(%3372 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3374 = tensor.empty() : tensor<1x32x80x80xf32> + %3375 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3370, %3373 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3374 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2524 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2525 = tensor.collapse_shape %3375 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2526 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2527 = tensor.collapse_shape %3324 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2528 = arith.constant 0.000000e+00 : f32 + %3376 = tensor.empty() : tensor<32x80x128xf32> + %3377 = linalg.fill ins(%cst_2528 : f32) outs(%3376 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3378 = linalg.batch_matmul ins(%collapsed_2525, %collapsed_2527 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3377 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2529 = tensor.expand_shape %3378 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3379 = tensor.empty() : tensor<1x80x32x128xf32> + %3380 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2529 : tensor<1x32x80x128xf32>) outs(%3379 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2530 = tensor.collapse_shape %3380 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3381 = 
tensor.empty() : tensor<4096x4096xf32> + %3382 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_478 : tensor<4096x4096xf32>) outs(%3381 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2531 = tensor.collapse_shape %collapsed_2530 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2532 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3383 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2531, %3382 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2532 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2533 = tensor.expand_shape %3383 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3384 = tensor.empty() : tensor<1x80x4096xf32> + %3385 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3298, %expanded_2533 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3384 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3386 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2534 = arith.constant 2.000000e+00 : f32 + %3387 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385 : tensor<1x80x4096xf32>) outs(%3386 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2534 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2535 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3388 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3387 : tensor<1x80x4096xf32>) outs(%cst_2535 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 
: f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2536 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3389 = tensor.empty() : tensor<1x80x1xf32> + %3390 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3388, %cst_2536 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3389 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3391 = tensor.empty() : tensor<1x80x1xf32> + %3392 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3390 : tensor<1x80x1xf32>) outs(%3391 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3393 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2537 = tensor.collapse_shape %3392 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3394 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385, %collapsed_2537 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3393 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2538 = tensor.expand_shape %extracted_slice_58 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3395 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2539 = tensor.collapse_shape %expanded_2538 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3396 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2539, %3394 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3395 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = 
arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3397 = tensor.empty() : tensor<4096x11008xf32> + %3398 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_480 : tensor<11008x4096xf32>) outs(%3397 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2540 = tensor.collapse_shape %3396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2541 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3399 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2540, %3398 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2541 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2542 = tensor.expand_shape %3399 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3400 = tensor.empty() : tensor<1x80x11008xf32> + %3401 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2542 : tensor<1x80x11008xf32>) outs(%3400 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3402 = tensor.empty() : tensor<4096x11008xf32> + %3403 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_482 : tensor<11008x4096xf32>) outs(%3402 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2543 = tensor.collapse_shape %3396 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2544 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3404 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2543, %3403 : 
tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2544 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2545 = tensor.expand_shape %3404 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3405 = tensor.empty() : tensor<1x80x11008xf32> + %3406 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3401, %expanded_2545 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3405 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3407 = tensor.empty() : tensor<11008x4096xf32> + %3408 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_484 : tensor<4096x11008xf32>) outs(%3407 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2546 = tensor.collapse_shape %3406 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2547 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3409 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2546, %3408 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2547 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2548 = tensor.expand_shape %3409 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3410 = tensor.empty() : tensor<1x80x4096xf32> + %3411 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3385, %expanded_2548 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3410 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3412 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2549 = arith.constant 2.000000e+00 : f32 + %3413 = linalg.generic {indexing_maps = 
[#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411 : tensor<1x80x4096xf32>) outs(%3412 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2549 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2550 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3414 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3413 : tensor<1x80x4096xf32>) outs(%cst_2550 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2551 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3415 = tensor.empty() : tensor<1x80x1xf32> + %3416 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3414, %cst_2551 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3415 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3417 = tensor.empty() : tensor<1x80x1xf32> + %3418 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3416 : tensor<1x80x1xf32>) outs(%3417 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3419 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2552 = tensor.collapse_shape %3418 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3420 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411, %collapsed_2552 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3419 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = 
arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2553 = tensor.expand_shape %extracted_slice_59 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3421 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2554 = tensor.collapse_shape %expanded_2553 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3422 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2554, %3420 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3421 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3423 = tensor.empty() : tensor<4096x4096xf32> + %3424 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_486 : tensor<4096x4096xf32>) outs(%3423 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2555 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2556 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3425 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2555, %3424 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2556 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2557 = tensor.expand_shape %3425 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3426 = tensor.empty() : tensor<4096x4096xf32> + %3427 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_488 : tensor<4096x4096xf32>) outs(%3426 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2558 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2559 = arith.constant dense<0.000000e+00> 
: tensor<80x4096xf32> + %3428 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2558, %3427 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2559 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2560 = tensor.expand_shape %3428 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3429 = tensor.empty() : tensor<4096x4096xf32> + %3430 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_490 : tensor<4096x4096xf32>) outs(%3429 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2561 = tensor.collapse_shape %3422 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2562 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3431 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2561, %3430 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2562 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2563 = tensor.expand_shape %3431 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2564 = tensor.expand_shape %expanded_2557 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3432 = tensor.empty() : tensor<1x32x80x128xf32> + %3433 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2564 : tensor<1x80x32x128xf32>) outs(%3432 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2565 = tensor.expand_shape %expanded_2560 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3434 = tensor.empty() : tensor<1x32x80x128xf32> + %3435 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2565 : tensor<1x80x32x128xf32>) outs(%3434 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + 
linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2566 = tensor.expand_shape %expanded_2563 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3436 = tensor.empty() : tensor<1x32x80x128xf32> + %3437 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2566 : tensor<1x80x32x128xf32>) outs(%3436 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2567 = tensor.extract_slice %expanded_636[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2568 = tensor.extract_slice %expanded_638[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %3438 = tensor.empty() : tensor<1x80x128xf32> + %3439 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2567 : tensor<1x1x80x128xf32>) outs(%3438 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3440 = tensor.empty() : tensor<80x128xf32> + %3441 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3439 : tensor<1x80x128xf32>) outs(%3440 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3442 = tensor.empty() : tensor<1x80x128xf32> + %3443 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2568 : tensor<1x1x80x128xf32>) outs(%3442 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3444 = tensor.empty() : tensor<80x128xf32> + %3445 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3443 : tensor<1x80x128xf32>) outs(%3444 : 
tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3446 = tensor.empty() : tensor<1x80x128xf32> + %3447 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3446 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3441[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2569 = tensor.expand_shape %3447 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3448 = tensor.empty() : tensor<1x80x128xf32> + %3449 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3448 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3445[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2570 = tensor.expand_shape %3449 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3450 = tensor.empty() : tensor<1x32x80x128xf32> + %3451 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3433, %3447 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3450 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2571 = tensor.extract_slice %3433[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2572 = tensor.extract_slice %3433[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + 
%3452 = tensor.empty() : tensor<1x32x80x64xf32> + %3453 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2572 : tensor<1x32x80x64xf32>) outs(%3452 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3454 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2573 = tensor.insert_slice %3453 into %3454[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2574 = tensor.insert_slice %extracted_slice_2571 into %inserted_slice_2573[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3455 = tensor.empty() : tensor<1x32x80x128xf32> + %3456 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2574, %3449 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3455 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3457 = tensor.empty() : tensor<1x32x80x128xf32> + %3458 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3451, %3456 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3457 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3459 = tensor.empty() : tensor<1x32x80x128xf32> + %3460 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3435, %3447 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3459 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + 
%3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2575 = tensor.extract_slice %3435[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2576 = tensor.extract_slice %3435[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3461 = tensor.empty() : tensor<1x32x80x64xf32> + %3462 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2576 : tensor<1x32x80x64xf32>) outs(%3461 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3463 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2577 = tensor.insert_slice %3462 into %3463[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2578 = tensor.insert_slice %extracted_slice_2575 into %inserted_slice_2577[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3464 = tensor.empty() : tensor<1x32x80x128xf32> + %3465 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2578, %3449 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3464 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3466 = tensor.empty() : tensor<1x32x80x128xf32> + %3467 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3460, %3465 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3466 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + 
linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3468 = tensor.empty() : tensor<1x32x128x80xf32> + %3469 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3467 : tensor<1x32x80x128xf32>) outs(%3468 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2579 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2580 = tensor.collapse_shape %3458 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2581 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2582 = tensor.collapse_shape %3469 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2583 = arith.constant 0.000000e+00 : f32 + %3470 = tensor.empty() : tensor<32x80x80xf32> + %3471 = linalg.fill ins(%cst_2583 : f32) outs(%3470 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3472 = linalg.batch_matmul ins(%collapsed_2580, %collapsed_2582 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3471 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2584 = tensor.expand_shape %3472 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2585 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3473 = tensor.empty() : tensor<1x32x80x80xf32> + %3474 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2585 : tensor<1x32x80x80xf32>) outs(%3473 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3475 = tensor.empty() : tensor<1x32x80x80xf32> + %3476 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2584, %3474 : 
tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3475 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3477 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2586 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3478 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3476, %collapsed_2586 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3477 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3479 = tensor.empty() : tensor<1x32x80x1xf32> + %3480 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3479 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3481 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3478 : tensor<1x32x80x80xf32>) outs(%3479 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3482 = tensor.empty() : tensor<1x32x80x80xf32> + %3483 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3478, %3481 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3482 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3484 = tensor.empty() : tensor<1x32x80x1xf32> + %3485 = linalg.generic {indexing_maps = [#map7], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3484 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3486 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3483 : tensor<1x32x80x80xf32>) outs(%3485 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3487 = tensor.empty() : tensor<1x32x80x80xf32> + %3488 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3483, %3486 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3487 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2587 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2588 = tensor.collapse_shape %3488 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2589 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2590 = tensor.collapse_shape %3437 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2591 = arith.constant 0.000000e+00 : f32 + %3489 = tensor.empty() : tensor<32x80x128xf32> + %3490 = linalg.fill ins(%cst_2591 : f32) outs(%3489 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3491 = linalg.batch_matmul ins(%collapsed_2588, %collapsed_2590 : tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3490 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2592 = tensor.expand_shape %3491 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3492 = tensor.empty() : tensor<1x80x32x128xf32> + %3493 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2592 : tensor<1x32x80x128xf32>) outs(%3492 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2593 = tensor.collapse_shape %3493 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3494 = tensor.empty() : tensor<4096x4096xf32> + %3495 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_492 : tensor<4096x4096xf32>) outs(%3494 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2594 = tensor.collapse_shape %collapsed_2593 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2595 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3496 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2594, %3495 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2595 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2596 = tensor.expand_shape %3496 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3497 = tensor.empty() : tensor<1x80x4096xf32> + %3498 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3411, %expanded_2596 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3497 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3499 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2597 = arith.constant 2.000000e+00 : f32 + %3500 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498 : tensor<1x80x4096xf32>) outs(%3499 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2597 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2598 = arith.constant 
dense<0.000000e+00> : tensor<1x80x1xf32> + %3501 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3500 : tensor<1x80x4096xf32>) outs(%cst_2598 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2599 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3502 = tensor.empty() : tensor<1x80x1xf32> + %3503 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3501, %cst_2599 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3502 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3504 = tensor.empty() : tensor<1x80x1xf32> + %3505 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3503 : tensor<1x80x1xf32>) outs(%3504 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3506 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2600 = tensor.collapse_shape %3505 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3507 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498, %collapsed_2600 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3506 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2601 = tensor.expand_shape %extracted_slice_60 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3508 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2602 = tensor.collapse_shape 
%expanded_2601 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3509 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2602, %3507 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3508 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3510 = tensor.empty() : tensor<4096x11008xf32> + %3511 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_494 : tensor<11008x4096xf32>) outs(%3510 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2603 = tensor.collapse_shape %3509 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2604 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3512 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2603, %3511 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2604 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2605 = tensor.expand_shape %3512 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3513 = tensor.empty() : tensor<1x80x11008xf32> + %3514 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2605 : tensor<1x80x11008xf32>) outs(%3513 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3515 = tensor.empty() : tensor<4096x11008xf32> + %3516 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_496 : tensor<11008x4096xf32>) outs(%3515 : tensor<4096x11008xf32>) 
{ + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2606 = tensor.collapse_shape %3509 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2607 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3517 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2606, %3516 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2607 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2608 = tensor.expand_shape %3517 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3518 = tensor.empty() : tensor<1x80x11008xf32> + %3519 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3514, %expanded_2608 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3518 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3520 = tensor.empty() : tensor<11008x4096xf32> + %3521 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_498 : tensor<4096x11008xf32>) outs(%3520 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2609 = tensor.collapse_shape %3519 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2610 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3522 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2609, %3521 : tensor<80x11008xf32>, tensor<11008x4096xf32>) outs(%cst_2610 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2611 = tensor.expand_shape %3522 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3523 = tensor.empty() : tensor<1x80x4096xf32> + %3524 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3498, %expanded_2611 : 
tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3523 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3525 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2612 = arith.constant 2.000000e+00 : f32 + %3526 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524 : tensor<1x80x4096xf32>) outs(%3525 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2612 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2613 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3527 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3526 : tensor<1x80x4096xf32>) outs(%cst_2613 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2614 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3528 = tensor.empty() : tensor<1x80x1xf32> + %3529 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3527, %cst_2614 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3528 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3530 = tensor.empty() : tensor<1x80x1xf32> + %3531 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3529 : tensor<1x80x1xf32>) outs(%3530 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3532 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2615 = 
tensor.collapse_shape %3531 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3533 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524, %collapsed_2615 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3532 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2616 = tensor.expand_shape %extracted_slice_61 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3534 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2617 = tensor.collapse_shape %expanded_2616 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3535 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2617, %3533 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3534 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3536 = tensor.empty() : tensor<4096x4096xf32> + %3537 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_500 : tensor<4096x4096xf32>) outs(%3536 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2618 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2619 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3538 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2618, %3537 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2619 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2620 = tensor.expand_shape %3538 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3539 = tensor.empty() : tensor<4096x4096xf32> + %3540 = linalg.generic {indexing_maps = [#map, #map9], 
iterator_types = ["parallel", "parallel"]} ins(%expanded_502 : tensor<4096x4096xf32>) outs(%3539 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2621 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2622 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3541 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2621, %3540 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2622 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2623 = tensor.expand_shape %3541 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3542 = tensor.empty() : tensor<4096x4096xf32> + %3543 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_504 : tensor<4096x4096xf32>) outs(%3542 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2624 = tensor.collapse_shape %3535 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2625 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3544 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2624, %3543 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2625 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2626 = tensor.expand_shape %3544 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %expanded_2627 = tensor.expand_shape %expanded_2620 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3545 = tensor.empty() : tensor<1x32x80x128xf32> + %3546 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2627 : tensor<1x80x32x128xf32>) outs(%3545 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2628 = tensor.expand_shape %expanded_2623 [[0], 
[1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3547 = tensor.empty() : tensor<1x32x80x128xf32> + %3548 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2628 : tensor<1x80x32x128xf32>) outs(%3547 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %expanded_2629 = tensor.expand_shape %expanded_2626 [[0], [1], [2, 3]] : tensor<1x80x4096xf32> into tensor<1x80x32x128xf32> + %3549 = tensor.empty() : tensor<1x32x80x128xf32> + %3550 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2629 : tensor<1x80x32x128xf32>) outs(%3549 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2630 = tensor.extract_slice %expanded_640[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %extracted_slice_2631 = tensor.extract_slice %expanded_642[0, 0, 0, 0] [1, 1, 80, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x80x128xf32> + %3551 = tensor.empty() : tensor<1x80x128xf32> + %3552 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_2630 : tensor<1x1x80x128xf32>) outs(%3551 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3553 = tensor.empty() : tensor<80x128xf32> + %3554 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3552 : tensor<1x80x128xf32>) outs(%3553 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3555 = tensor.empty() : tensor<1x80x128xf32> + %3556 = linalg.generic {indexing_maps = [#map11, #map2], iterator_types = ["parallel", "parallel", "parallel"]} 
ins(%extracted_slice_2631 : tensor<1x1x80x128xf32>) outs(%3555 : tensor<1x80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x128xf32> + %3557 = tensor.empty() : tensor<80x128xf32> + %3558 = linalg.generic {indexing_maps = [#map12, #map], iterator_types = ["parallel", "parallel"]} ins(%3556 : tensor<1x80x128xf32>) outs(%3557 : tensor<80x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<80x128xf32> + %3559 = tensor.empty() : tensor<1x80x128xf32> + %3560 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3559 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3554[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2632 = tensor.expand_shape %3560 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3561 = tensor.empty() : tensor<1x80x128xf32> + %3562 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_643 : tensor<1x80xi64>) outs(%3561 : tensor<1x80x128xf32>) { + ^bb0(%in: i64, %out: f32): + %3652 = arith.index_cast %in : i64 to index + %3653 = linalg.index 2 : index + %extracted = tensor.extract %3558[%3652, %3653] : tensor<80x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x80x128xf32> + %expanded_2633 = tensor.expand_shape %3562 [[0, 1], [2], [3]] : tensor<1x80x128xf32> into tensor<1x1x80x128xf32> + %3563 = tensor.empty() : tensor<1x32x80x128xf32> + %3564 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3546, %3560 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3563 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, 
%in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2634 = tensor.extract_slice %3546[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2635 = tensor.extract_slice %3546[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3565 = tensor.empty() : tensor<1x32x80x64xf32> + %3566 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2635 : tensor<1x32x80x64xf32>) outs(%3565 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3567 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2636 = tensor.insert_slice %3566 into %3567[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2637 = tensor.insert_slice %extracted_slice_2634 into %inserted_slice_2636[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3568 = tensor.empty() : tensor<1x32x80x128xf32> + %3569 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2637, %3562 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3568 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3570 = tensor.empty() : tensor<1x32x80x128xf32> + %3571 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3564, %3569 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3570 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + 
} -> tensor<1x32x80x128xf32> + %3572 = tensor.empty() : tensor<1x32x80x128xf32> + %3573 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3548, %3560 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3572 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %extracted_slice_2638 = tensor.extract_slice %3548[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %extracted_slice_2639 = tensor.extract_slice %3548[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x128xf32> to tensor<1x32x80x64xf32> + %3574 = tensor.empty() : tensor<1x32x80x64xf32> + %3575 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2639 : tensor<1x32x80x64xf32>) outs(%3574 : tensor<1x32x80x64xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x64xf32> + %3576 = tensor.empty() : tensor<1x32x80x128xf32> + %inserted_slice_2640 = tensor.insert_slice %3575 into %3576[0, 0, 0, 0] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %inserted_slice_2641 = tensor.insert_slice %extracted_slice_2638 into %inserted_slice_2640[0, 0, 0, 64] [1, 32, 80, 64] [1, 1, 1, 1] : tensor<1x32x80x64xf32> into tensor<1x32x80x128xf32> + %3577 = tensor.empty() : tensor<1x32x80x128xf32> + %3578 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_2641, %3562 : tensor<1x32x80x128xf32>, tensor<1x80x128xf32>) outs(%3577 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3579 = 
tensor.empty() : tensor<1x32x80x128xf32> + %3580 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3573, %3578 : tensor<1x32x80x128xf32>, tensor<1x32x80x128xf32>) outs(%3579 : tensor<1x32x80x128xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x128xf32> + %3581 = tensor.empty() : tensor<1x32x128x80xf32> + %3582 = linalg.generic {indexing_maps = [#map14, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3580 : tensor<1x32x80x128xf32>) outs(%3581 : tensor<1x32x128x80xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x32x128x80xf32> + %cst_2642 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2643 = tensor.collapse_shape %3571 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2644 = arith.constant dense<0.000000e+00> : tensor<1x32x128x80xf32> + %collapsed_2645 = tensor.collapse_shape %3582 [[0, 1], [2], [3]] : tensor<1x32x128x80xf32> into tensor<32x128x80xf32> + %cst_2646 = arith.constant 0.000000e+00 : f32 + %3583 = tensor.empty() : tensor<32x80x80xf32> + %3584 = linalg.fill ins(%cst_2646 : f32) outs(%3583 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %3585 = linalg.batch_matmul ins(%collapsed_2643, %collapsed_2645 : tensor<32x80x128xf32>, tensor<32x128x80xf32>) outs(%3584 : tensor<32x80x80xf32>) -> tensor<32x80x80xf32> + %expanded_2647 = tensor.expand_shape %3585 [[0, 1], [2], [3]] : tensor<32x80x80xf32> into tensor<1x32x80x80xf32> + %cst_2648 = arith.constant dense<11.3137083> : tensor<1x32x80x80xf32> + %3586 = tensor.empty() : tensor<1x32x80x80xf32> + %3587 = linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2648 : tensor<1x32x80x80xf32>) outs(%3586 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %out: f32): + 
%cst_2684 = arith.constant 1.000000e+00 : f32 + %3652 = arith.divf %cst_2684, %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3588 = tensor.empty() : tensor<1x32x80x80xf32> + %3589 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2647, %3587 : tensor<1x32x80x80xf32>, tensor<1x32x80x80xf32>) outs(%3588 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3590 = tensor.empty() : tensor<1x32x80x80xf32> + %collapsed_2649 = tensor.collapse_shape %21 [[0, 1], [2], [3]] : tensor<1x1x80x80xf32> into tensor<1x80x80xf32> + %3591 = linalg.generic {indexing_maps = [#map7, #map13, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3589, %collapsed_2649 : tensor<1x32x80x80xf32>, tensor<1x80x80xf32>) outs(%3590 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %3592 = tensor.empty() : tensor<1x32x80x1xf32> + %3593 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3592 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0xFF800000 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3594 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3591 : tensor<1x32x80x80xf32>) outs(%3592 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.maximumf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3595 = tensor.empty() : tensor<1x32x80x80xf32> + %3596 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3591, %3594 : 
tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3595 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.subf %in, %in_2684 : f32 + %3653 = math.exp %3652 : f32 + linalg.yield %3653 : f32 + } -> tensor<1x32x80x80xf32> + %3597 = tensor.empty() : tensor<1x32x80x1xf32> + %3598 = linalg.generic {indexing_maps = [#map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3597 : tensor<1x32x80x1xf32>) { + ^bb0(%out: f32): + %cst_2684 = arith.constant 0.000000e+00 : f32 + linalg.yield %cst_2684 : f32 + } -> tensor<1x32x80x1xf32> + %3599 = linalg.generic {indexing_maps = [#map7, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3596 : tensor<1x32x80x80xf32>) outs(%3598 : tensor<1x32x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.addf %in, %out : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x1xf32> + %3600 = tensor.empty() : tensor<1x32x80x80xf32> + %3601 = linalg.generic {indexing_maps = [#map7, #map15, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3596, %3599 : tensor<1x32x80x80xf32>, tensor<1x32x80x1xf32>) outs(%3600 : tensor<1x32x80x80xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.divf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x32x80x80xf32> + %cst_2650 = arith.constant dense<0.000000e+00> : tensor<1x32x80x80xf32> + %collapsed_2651 = tensor.collapse_shape %3601 [[0, 1], [2], [3]] : tensor<1x32x80x80xf32> into tensor<32x80x80xf32> + %cst_2652 = arith.constant dense<0.000000e+00> : tensor<1x32x80x128xf32> + %collapsed_2653 = tensor.collapse_shape %3550 [[0, 1], [2], [3]] : tensor<1x32x80x128xf32> into tensor<32x80x128xf32> + %cst_2654 = arith.constant 0.000000e+00 : f32 + %3602 = tensor.empty() : tensor<32x80x128xf32> + %3603 = linalg.fill ins(%cst_2654 : f32) outs(%3602 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %3604 = linalg.batch_matmul ins(%collapsed_2651, %collapsed_2653 
: tensor<32x80x80xf32>, tensor<32x80x128xf32>) outs(%3603 : tensor<32x80x128xf32>) -> tensor<32x80x128xf32> + %expanded_2655 = tensor.expand_shape %3604 [[0, 1], [2], [3]] : tensor<32x80x128xf32> into tensor<1x32x80x128xf32> + %3605 = tensor.empty() : tensor<1x80x32x128xf32> + %3606 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2655 : tensor<1x32x80x128xf32>) outs(%3605 : tensor<1x80x32x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x80x32x128xf32> + %collapsed_2656 = tensor.collapse_shape %3606 [[0], [1], [2, 3]] : tensor<1x80x32x128xf32> into tensor<1x80x4096xf32> + %3607 = tensor.empty() : tensor<4096x4096xf32> + %3608 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_506 : tensor<4096x4096xf32>) outs(%3607 : tensor<4096x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x4096xf32> + %collapsed_2657 = tensor.collapse_shape %collapsed_2656 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2658 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3609 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2657, %3608 : tensor<80x4096xf32>, tensor<4096x4096xf32>) outs(%cst_2658 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2659 = tensor.expand_shape %3609 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3610 = tensor.empty() : tensor<1x80x4096xf32> + %3611 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3524, %expanded_2659 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3610 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3612 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2660 = arith.constant 
2.000000e+00 : f32 + %3613 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611 : tensor<1x80x4096xf32>) outs(%3612 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2660 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2661 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3614 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3613 : tensor<1x80x4096xf32>) outs(%cst_2661 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2662 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3615 = tensor.empty() : tensor<1x80x1xf32> + %3616 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3614, %cst_2662 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3615 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3617 = tensor.empty() : tensor<1x80x1xf32> + %3618 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3616 : tensor<1x80x1xf32>) outs(%3617 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3619 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2663 = tensor.collapse_shape %3618 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3620 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611, %collapsed_2663 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3619 : 
tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2664 = tensor.expand_shape %extracted_slice_62 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3621 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2665 = tensor.collapse_shape %expanded_2664 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3622 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2665, %3620 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3621 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3623 = tensor.empty() : tensor<4096x11008xf32> + %3624 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_508 : tensor<11008x4096xf32>) outs(%3623 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2666 = tensor.collapse_shape %3622 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2667 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3625 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2666, %3624 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2667 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2668 = tensor.expand_shape %3625 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3626 = tensor.empty() : tensor<1x80x11008xf32> + %3627 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_2668 : tensor<1x80x11008xf32>) outs(%3626 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = arith.negf %in : f32 + %3653 = math.exp %3652 : f32 + %cst_2684 = arith.constant 1.000000e+00 : 
f32 + %3654 = arith.addf %cst_2684, %3653 : f32 + %3655 = arith.divf %in, %3654 : f32 + linalg.yield %3655 : f32 + } -> tensor<1x80x11008xf32> + %3628 = tensor.empty() : tensor<4096x11008xf32> + %3629 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_510 : tensor<11008x4096xf32>) outs(%3628 : tensor<4096x11008xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x11008xf32> + %collapsed_2669 = tensor.collapse_shape %3622 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2670 = arith.constant dense<0.000000e+00> : tensor<80x11008xf32> + %3630 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2669, %3629 : tensor<80x4096xf32>, tensor<4096x11008xf32>) outs(%cst_2670 : tensor<80x11008xf32>) -> tensor<80x11008xf32> + %expanded_2671 = tensor.expand_shape %3630 [[0, 1], [2]] : tensor<80x11008xf32> into tensor<1x80x11008xf32> + %3631 = tensor.empty() : tensor<1x80x11008xf32> + %3632 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3627, %expanded_2671 : tensor<1x80x11008xf32>, tensor<1x80x11008xf32>) outs(%3631 : tensor<1x80x11008xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x11008xf32> + %3633 = tensor.empty() : tensor<11008x4096xf32> + %3634 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_512 : tensor<4096x11008xf32>) outs(%3633 : tensor<11008x4096xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<11008x4096xf32> + %collapsed_2672 = tensor.collapse_shape %3632 [[0, 1], [2]] : tensor<1x80x11008xf32> into tensor<80x11008xf32> + %cst_2673 = arith.constant dense<0.000000e+00> : tensor<80x4096xf32> + %3635 = linalg.matmul {cast = #linalg.type_fn} ins(%collapsed_2672, %3634 : tensor<80x11008xf32>, tensor<11008x4096xf32>) 
outs(%cst_2673 : tensor<80x4096xf32>) -> tensor<80x4096xf32> + %expanded_2674 = tensor.expand_shape %3635 [[0, 1], [2]] : tensor<80x4096xf32> into tensor<1x80x4096xf32> + %3636 = tensor.empty() : tensor<1x80x4096xf32> + %3637 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3611, %expanded_2674 : tensor<1x80x4096xf32>, tensor<1x80x4096xf32>) outs(%3636 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3638 = tensor.empty() : tensor<1x80x4096xf32> + %cst_2675 = arith.constant 2.000000e+00 : f32 + %3639 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3637 : tensor<1x80x4096xf32>) outs(%3638 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.powf %in, %cst_2675 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %cst_2676 = arith.constant dense<0.000000e+00> : tensor<1x80x1xf32> + %3640 = linalg.generic {indexing_maps = [#map8, #map6], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%3639 : tensor<1x80x4096xf32>) outs(%cst_2676 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %cst_2684 = arith.constant 4.096000e+03 : f32 + %3652 = arith.divf %in, %cst_2684 : f32 + %3653 = arith.addf %3652, %out : f32 + linalg.yield %3653 : f32 + } -> tensor<1x80x1xf32> + %cst_2677 = arith.constant dense<9.99999974E-6> : tensor<1x80x1xf32> + %3641 = tensor.empty() : tensor<1x80x1xf32> + %3642 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3640, %cst_2677 : tensor<1x80x1xf32>, tensor<1x80x1xf32>) outs(%3641 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.addf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3643 = tensor.empty() : tensor<1x80x1xf32> 
+ %3644 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3642 : tensor<1x80x1xf32>) outs(%3643 : tensor<1x80x1xf32>) { + ^bb0(%in: f32, %out: f32): + %3652 = math.rsqrt %in : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x1xf32> + %3645 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2678 = tensor.collapse_shape %3644 [[0], [1, 2]] : tensor<1x80x1xf32> into tensor<1x80xf32> + %3646 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3637, %collapsed_2678 : tensor<1x80x4096xf32>, tensor<1x80xf32>) outs(%3645 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %expanded_2679 = tensor.expand_shape %extracted_slice_63 [[0, 1, 2]] : tensor<4096xf32> into tensor<1x1x4096xf32> + %3647 = tensor.empty() : tensor<1x80x4096xf32> + %collapsed_2680 = tensor.collapse_shape %expanded_2679 [[0, 1], [2]] : tensor<1x1x4096xf32> into tensor<1x4096xf32> + %3648 = linalg.generic {indexing_maps = [#map5, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_2680, %3646 : tensor<1x4096xf32>, tensor<1x80x4096xf32>) outs(%3647 : tensor<1x80x4096xf32>) { + ^bb0(%in: f32, %in_2684: f32, %out: f32): + %3652 = arith.mulf %in, %in_2684 : f32 + linalg.yield %3652 : f32 + } -> tensor<1x80x4096xf32> + %3649 = tensor.empty() : tensor<4096x32000xf32> + %3650 = linalg.generic {indexing_maps = [#map, #map9], iterator_types = ["parallel", "parallel"]} ins(%expanded_514 : tensor<32000x4096xf32>) outs(%3649 : tensor<4096x32000xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<4096x32000xf32> + %collapsed_2681 = tensor.collapse_shape %3648 [[0, 1], [2]] : tensor<1x80x4096xf32> into tensor<80x4096xf32> + %cst_2682 = arith.constant dense<0.000000e+00> : tensor<80x32000xf32> + %3651 = linalg.matmul {cast 
= #linalg.type_fn} ins(%collapsed_2681, %3650 : tensor<80x4096xf32>, tensor<4096x32000xf32>) outs(%cst_2682 : tensor<80x32000xf32>) -> tensor<80x32000xf32> + %expanded_2683 = tensor.expand_shape %3651 [[0, 1], [2]] : tensor<80x32000xf32> into tensor<1x80x32000xf32> + return %expanded_2683 : tensor<1x80x32000xf32> + } +} + diff --git a/examples/BuddyLlama/llama-main.cpp b/examples/BuddyLlama/llama-main.cpp index 78b5cec02..55530a01c 100644 --- a/examples/BuddyLlama/llama-main.cpp +++ b/examples/BuddyLlama/llama-main.cpp @@ -18,12 +18,9 @@ #include #include #include -#include #include #include #include -#include -#include using namespace buddy;
diff --git a/examples/BuddyLlama/llama.sh b/examples/BuddyLlama/llama.sh
new file mode 100755
index 000000000..65eab547f
--- /dev/null
+++ b/examples/BuddyLlama/llama.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Lowers llama-linalg-default.mlir to an NVVM object file (llama.o), links it with llama-main.cpp.o (which must already exist in this directory) and runs llama.out.
+# LLVM_BUILD_DIR selects the LLVM build tree that provides llc and the MLIR runtime libraries.
+set -euo pipefail
+
+LLVM_BUILD_DIR="${LLVM_BUILD_DIR:-/home/liam/IPRC/llvm-project/build}"
+
+buddy-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o llama-bufferized.mlir
+# mlir-opt llama-linalg-default.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries copy-before-write" -expand-realloc -resolve-shaped-type-result-dims -canonicalize -buffer-deallocation-simplification -bufferization-lower-deallocations -cse -canonicalize -buffer-deallocation-pipeline -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -o llama-bufferized.mlir
+buddy-opt llama-bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o llama-outlined.mlir
+buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir
+buddy-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
+mlir-opt llama-nvvm.mlir -llvm-request-c-wrappers -o llama-wrapper.mlir
+mlir-opt llama-wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o llama-cubin.mlir
+mlir-translate llama-cubin.mlir --mlir-to-llvmir -o llama.ll
+"${LLVM_BUILD_DIR}/bin/llc" llama.ll -filetype=obj -relocation-model=pic -O3 -o llama.o
+clang llama.o llama-main.cpp.o "${LLVM_BUILD_DIR}/lib/libmlir_cuda_runtime.so" "${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.so" -lstdc++ -o llama.out
+./llama.out
diff --git a/examples/BuddyLlama/test.mlir b/examples/BuddyLlama/test.mlir
new file mode 100644
index 000000000..12aba71c3
--- /dev/null
+++ b/examples/BuddyLlama/test.mlir
@@ -0,0 +1,26 @@
+// Multiplies an 8x8 i8 matrix filled with 2 by an 8x8 i8 matrix filled with 1 into %mem2.
+func.func @main() {
+  %0 = arith.constant 0 : i8
+  %1 = arith.constant 1 : i8
+  %2 = arith.constant 2 : i8
+  %mem0 = memref.alloc() : memref<8x8xi8>
+  %mem1 = memref.alloc() : memref<8x8xi8>
+  %mem2 = memref.alloc() : memref<8x8xi8>
+  linalg.fill
+    ins(%2 : i8)
+    outs(%mem0 : memref<8x8xi8>)
+  linalg.fill
+    ins(%1 : i8)
+    outs(%mem1 : memref<8x8xi8>)
+  // linalg.matmul accumulates into its output operand, so zero %mem2 before use.
+  linalg.fill
+    ins(%0 : i8)
+    outs(%mem2 : memref<8x8xi8>)
+  linalg.matmul
+    ins(%mem0, %mem1 : memref<8x8xi8>, memref<8x8xi8>)
+    outs(%mem2 : memref<8x8xi8>)
+  memref.dealloc %mem0 : memref<8x8xi8>
+  memref.dealloc %mem1 : memref<8x8xi8>
+  memref.dealloc %mem2 : memref<8x8xi8>
+  return
+}
diff --git a/examples/BuddyLlama/test.sh b/examples/BuddyLlama/test.sh
new file mode 100755
index 000000000..b31a5b3b9
--- /dev/null
+++ b/examples/BuddyLlama/test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Lowers test.mlir through the bufferization and GPU/NVVM passes and executes the result with mlir-cpu-runner.
+# LLVM_BUILD_DIR selects the LLVM build tree that provides the MLIR runtime libraries.
+set -euo pipefail
+
+LLVM_BUILD_DIR="${LLVM_BUILD_DIR:-/home/liam/IPRC/llvm-project/build}"
+
+mlir-opt test.mlir -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize -o bufferized.mlir
+mlir-opt bufferized.mlir -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -canonicalize -gpu-kernel-outlining -o outlined.mlir
+buddy-opt outlined.mlir -gpu-host-register -o host-registered.mlir
+mlir-opt host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o nvvm.mlir
+mlir-opt nvvm.mlir -llvm-request-c-wrappers -o wrapper.mlir
+mlir-opt wrapper.mlir --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o cubin.mlir
+mlir-cpu-runner cubin.mlir -entry-point-result=void -shared-libs="${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.so" -shared-libs="${LLVM_BUILD_DIR}/lib/libmlir_cuda_runtime.so"
diff --git a/examples/BuddyPython/bert.py b/examples/BuddyPython/bert.py index 7f4f00435..e57dc991b 100644 --- a/examples/BuddyPython/bert.py +++ b/examples/BuddyPython/bert.py @@ -15,6 +15,10 @@ text = "Replace me by any text you'd like."
encoded_text = tokenizer(text, return_tensors="pt") with torch.no_grad(): - module, params = dynamo_compiler.importer(model, **encoded_text) - print(module) - print(params) + graphs = dynamo_compiler.importer(model, **encoded_text) + +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +graph.lower_to_top_level_ir(do_params_pack=True) +print(graph._imported_module) +print(params) diff --git a/examples/BuddyPython/module_gen.py b/examples/BuddyPython/module_gen.py index 10a1e2ee1..e2c722ceb 100644 --- a/examples/BuddyPython/module_gen.py +++ b/examples/BuddyPython/module_gen.py @@ -43,23 +43,12 @@ def foo(x, y): aot_autograd_decomposition=inductor_decomp, ) -# The first way to generate an MLIR Module: -# Pass the function and input data to the dynamo compiler's importer, -# and accepts the generated module and weight parameters. -module, params = dynamo_compiler.importer(foo, *(float32_in1, float32_in2)) - -print(module) -print(params) - -# The second way to generate an MLIR Module: -# Execute the target function using a define-by-run style, -# and get the module and weight parameters from the dynamo compiler's attribute. -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) - -foo_mlir(float32_in1, float32_in2) -print(dynamo_compiler.imported_module) -print(dynamo_compiler.imported_params) - -foo_mlir(int32_in1, int32_in2) -print(dynamo_compiler.imported_module) -print(dynamo_compiler.imported_params) +# Pass the function and input data to the dynamo compiler's importer; the +# importer first builds a graph. Then lower the graph to top-level IR +# (tosa, linalg, etc.). Finally, get the generated module and weight parameters. 
+graphs = dynamo_compiler.importer(foo, *(float32_in1, float32_in2)) +graph = graphs[0] +graph.lower_to_top_level_ir(do_params_pack=True) + +print(graph._imported_module) +print(dynamo_compiler.imported_params[graph]) diff --git a/examples/BuddyResNet18/import-resnet18.py b/examples/BuddyResNet18/import-resnet18.py new file mode 100644 index 000000000..c58f4a604 --- /dev/null +++ b/examples/BuddyResNet18/import-resnet18.py @@ -0,0 +1,45 @@ +# ===- import-resnet18.py ------------------------------------------------------ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the test of resnet18 model. +# +# ===--------------------------------------------------------------------------- + +import torch +import torchvision +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +model = torchvision.models.resnet18() +model = model.eval() + +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +data = torch.randn([1, 3, 224, 224]) +# Import the model into MLIR module and parameters. 
+with torch.no_grad(): + graphs = dynamo_compiler.importer(model, data) + +assert len(graphs) == 1 +graphs[0].lower_to_top_level_ir(do_params_pack=True) +print(graphs[0]._imported_module) diff --git a/examples/DAPDialect/CMakeLists.txt b/examples/DAPDialect/CMakeLists.txt index 21c766035..b147d5604 100644 --- a/examples/DAPDialect/CMakeLists.txt +++ b/examples/DAPDialect/CMakeLists.txt @@ -16,9 +16,9 @@ message(STATUS "Spliting size: ${SPLITING_SIZE}") # Buddy DAP Dialect FIR operation #------------------------------------------------------------------------------- -add_executable(firLowpass firLowpass.cpp) -add_dependencies(firLowpass buddy-opt) -target_link_libraries(firLowpass +add_executable(buddy-fir FIRLowpass.cpp) +add_dependencies(buddy-fir buddy-opt) +target_link_libraries(buddy-fir BuddyLibDAP ) @@ -26,9 +26,9 @@ target_link_libraries(firLowpass # Buddy DAP Dialect Biquad Operation #------------------------------------------------------------------------------- -add_executable(biquad biquad.cpp) -add_dependencies(biquad buddy-opt) -target_link_libraries(biquad +add_executable(buddy-biquad biquad.cpp) +add_dependencies(buddy-biquad buddy-opt) +target_link_libraries(buddy-biquad BuddyLibDAP ) @@ -36,8 +36,14 @@ target_link_libraries(biquad # Buddy DAP Dialect IIR Operation #------------------------------------------------------------------------------- -add_executable(iirLowpass iirLowpass.cpp) -add_dependencies(iirLowpass buddy-opt) -target_link_libraries(iirLowpass +add_executable(buddy-iir-scalar IIRLowpass.cpp) +add_dependencies(buddy-iir-scalar buddy-opt) +target_link_libraries(buddy-iir-scalar BuddyLibDAP ) + +add_executable(buddy-iir-vectorization IIRVectorization.cpp) +add_dependencies(buddy-iir-vectorization buddy-opt) +target_link_libraries(buddy-iir-vectorization + BuddyLibDAPVectorization +) diff --git a/examples/DAPDialect/firLowpass.cpp b/examples/DAPDialect/FIRLowpass.cpp similarity index 90% rename from examples/DAPDialect/firLowpass.cpp 
rename to examples/DAPDialect/FIRLowpass.cpp index 6a4052e8d..cfce56091 100644 --- a/examples/DAPDialect/firLowpass.cpp +++ b/examples/DAPDialect/FIRLowpass.cpp @@ -1,4 +1,4 @@ -//===- FirLowpass.cpp - Example of DAP fir filter ----------------------===// +//===- FIRLowpass.cpp - Example of DAP FIR Filter -------------------------===// // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ int main(int argc, char *argv[]) { if (argc == 3) { saveFileName = argv[2]; } - cout << "Usage: FirLowpass [loadPath] [savePath]" << endl; + cout << "Usage: FIRLowpass [loadPath] [savePath]" << endl; cout << "Current specified path: \n"; cout << "Load: " << fileName << endl; cout << "Save: " << saveFileName << endl; @@ -53,6 +53,6 @@ int main(int argc, char *argv[]) { output.getAudioFile().setAudioBuffer(nullptr); dap::fir(&aud.getMemRef(), &kernel, &output.getMemRef()); cout << "Saving file:" << endl; - cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl; + cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl; return 0; } diff --git a/examples/DAPDialect/iirLowpass.cpp b/examples/DAPDialect/IIRLowpass.cpp similarity index 82% rename from examples/DAPDialect/iirLowpass.cpp rename to examples/DAPDialect/IIRLowpass.cpp index f3d152802..1b69ec08b 100644 --- a/examples/DAPDialect/iirLowpass.cpp +++ b/examples/DAPDialect/IIRLowpass.cpp @@ -1,4 +1,4 @@ -//===- iirLowpass.cpp - Example of DAP iir filter -------------------------===// +//===- IIRLowpass.cpp - Example of DAP IIR Filter -------------------------===// // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,21 +30,23 @@ using namespace std; int main(int argc, char *argv[]) { string fileName = "../../tests/Interface/core/NASA_Mars.wav"; - string saveFileName = "IIR_NASA_Mars.wav"; + string saveFileName = "IIR_LOWPASS_NASA_Mars.wav"; if (argc >= 2) { fileName = argv[1]; } if (argc == 3) { saveFileName = argv[2]; } - cout << "Usage: FirLowpass [loadPath] [savePath]" << endl; + cout << "Usage: IIRLowpass [loadPath] [savePath]" << endl; cout << "Current specified path: \n"; cout << "Load: " << fileName << endl; cout << "Save: " << saveFileName << endl; + // Order of butterworth filter int order = 8; + // Each SOS matrix has 6 parameters. intptr_t kernelSize[2] = {int(order / 2), 6}; MemRef kernel(kernelSize); - + // cutoff frequency = 1000, fs = 48000. dap::iirLowpass(kernel, dap::butterworth(order), 1000, 48000); @@ -54,10 +56,10 @@ int main(int argc, char *argv[]) { output.fetchMetadata(aud.getAudioFile()); output.getAudioFile().setAudioBuffer(nullptr); - dap::iir(&aud.getMemRef(), &kernel, &output.getMemRef()); + dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef()); cout << "Saving file:" << endl; - cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl; + cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl; return 0; } diff --git a/examples/DAPDialect/IIRVectorization.cpp b/examples/DAPDialect/IIRVectorization.cpp new file mode 100644 index 000000000..c7d0c1955 --- /dev/null +++ b/examples/DAPDialect/IIRVectorization.cpp @@ -0,0 +1,66 @@ +//===- IIRVectorization.cpp - Example of DAP IIR Vectorization ------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements an end to end example for iir filter in buddy-mlir. It +// generates coefficients for a filter and applies it to a piece of mono audio, +// then saves the audio. +// This file will be linked with the object file which uses dap vectorization +// pass to generate the executable file. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace dap; +using namespace std; + +int main(int argc, char *argv[]) { + string fileName = "../../tests/Interface/core/NASA_Mars.wav"; + string saveFileName = "IIR_VECTORIZATION_PASS_NASA_Mars.wav"; + if (argc >= 2) { + fileName = argv[1]; + } + if (argc == 3) { + saveFileName = argv[2]; + } + cout << "Usage: IIRVectorizationPass [loadPath] [savePath]" << endl; + cout << "Current specified path: \n"; + cout << "Load: " << fileName << endl; + cout << "Save: " << saveFileName << endl; + // Order for butterworth filter. + int order = 8; + // Each SOS matrix has 6 parameters. + intptr_t kernelSize[2] = {int(order / 2), 6}; + MemRef kernel(kernelSize); + // cutoff frequency = 1000, fs = 48000. 
+ dap::iirLowpass(kernel, dap::butterworth(order), 1000, + 48000); + + auto aud = dap::Audio(fileName); + aud.getAudioFile().printSummary(); + dap::Audio output; + output.fetchMetadata(aud.getAudioFile()); + output.getAudioFile().setAudioBuffer(nullptr); + + dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef(), + /*isVectorization=*/true); + + cout << "Saving file:" << endl; + cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl; + + return 0; +} diff --git a/examples/DAPDialect/biquad.cpp b/examples/DAPDialect/biquad.cpp index 03709989e..14a78084a 100644 --- a/examples/DAPDialect/biquad.cpp +++ b/examples/DAPDialect/biquad.cpp @@ -1,4 +1,4 @@ -//===- biquad.cpp - Example of DAP iir filter -----------------------------===// +//===- biquad.cpp - Example of DAP Biquad Filter --------------------------===// // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -53,6 +53,6 @@ int main(int argc, char *argv[]) { dap::biquad(&aud.getMemRef(), &kernel, &output.getMemRef()); cout << "Saving file:" << endl; - cout << (output.save(saveFileName) ? "OK" : "NOT OK") << endl; + cout << (output.save(saveFileName) ? 
"OK" : "ERROR") << endl; return 0; } diff --git a/examples/MLIRTOSA/makefile b/examples/MLIRTOSA/makefile index 3400ea99c..9c1ff1b7f 100644 --- a/examples/MLIRTOSA/makefile +++ b/examples/MLIRTOSA/makefile @@ -16,6 +16,13 @@ MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib MTRIPLE := x86_64-apple-darwin endif +tosa-resize-lower-to-linalg: + @${MLIR_OPT} ./tosa-resize.mlir \ + -pass-pipeline="builtin.module( \ + func.func(tosa-to-linalg) \ + )" \ + -o ./log.mlir + tosa-resize-lower: @${MLIR_OPT} ./tosa-resize.mlir \ -pass-pipeline="builtin.module( \ @@ -26,7 +33,6 @@ tosa-resize-lower: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -44,7 +50,6 @@ tosa-resize-translate: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -62,7 +67,6 @@ tosa-resize-run: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -81,7 +85,6 @@ tosa-sigmoid-lower: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -99,7 +102,6 @@ tosa-sigmoid-translate: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -117,7 +119,6 @@ tosa-sigmoid-run: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ 
convert-func-to-llvm, \ @@ -136,7 +137,6 @@ tosa-log-lower: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -154,7 +154,6 @@ tosa-log-translate: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -172,7 +171,6 @@ tosa-log-run: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -191,7 +189,6 @@ tosa-add-lower: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -209,7 +206,6 @@ tosa-add-translate: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -227,7 +223,6 @@ tosa-add-run: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ convert-func-to-llvm, \ @@ -246,7 +241,6 @@ tosa-concat-lower: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ expand-strided-metadata, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ @@ -265,7 +259,6 @@ tosa-concat-translate: func-bufferize, \ func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ expand-strided-metadata, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ @@ -284,7 +277,6 @@ tosa-concat-run: func-bufferize, \ 
func.func(buffer-deallocation, convert-linalg-to-loops), \ convert-scf-to-cf, \ - convert-linalg-to-llvm, \ expand-strided-metadata, \ finalize-memref-to-llvm, \ convert-math-to-llvm, \ diff --git a/examples/MLIRTOSA/tosa-concat.mlir b/examples/MLIRTOSA/tosa-concat.mlir index f1a1b1d4a..5c67b435a 100644 --- a/examples/MLIRTOSA/tosa-concat.mlir +++ b/examples/MLIRTOSA/tosa-concat.mlir @@ -3,7 +3,7 @@ func.func @main() { %0 = arith.constant dense<[[11.,12.],[30.,40.]]> : tensor<2x2xf32> %1 = arith.constant dense<[[12.,13.],[23.,45.],[11.,89.]]> : tensor<3x2xf32> - %output = "tosa.concat"(%0,%1) {axis=0} : (tensor<2x2xf32>,tensor<3x2xf32>) -> tensor<5x2xf32> + %output = "tosa.concat"(%0,%1) {axis=0 : i32} : (tensor<2x2xf32>,tensor<3x2xf32>) -> tensor<5x2xf32> %tensor_unranked = tensor.cast %output : tensor<5x2xf32> to tensor<*xf32> call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () diff --git a/examples/MLIRTensor/makefile b/examples/MLIRTensor/makefile index 2790fbe32..0a8d1fa20 100644 --- a/examples/MLIRTensor/makefile +++ b/examples/MLIRTensor/makefile @@ -20,14 +20,14 @@ tensor-print-lower: @${MLIR_OPT} ./tensor-print.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir tensor-print-translate: @${MLIR_OPT} ./tensor-print.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll @@ -35,7 +35,7 @@ tensor-print-run: @${MLIR_OPT} ./tensor-print.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize \ -func-bufferize -buffer-deallocation 
-convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} @@ -43,7 +43,7 @@ tensor-print-run: tensor-collapse-shape-lower: @${MLIR_OPT} ./tensor-collapse-shape.mlir \ -arith-bufferize -tensor-bufferize -func-bufferize \ - -finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \ + -finalizing-bufferize -buffer-deallocation \ -expand-strided-metadata -lower-affine \ -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir @@ -51,7 +51,7 @@ tensor-collapse-shape-lower: tensor-collapse-shape-translate: @${MLIR_OPT} ./tensor-collapse-shape.mlir \ -arith-bufferize -tensor-bufferize -func-bufferize \ - -finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \ + -finalizing-bufferize -buffer-deallocation \ -expand-strided-metadata -lower-affine \ -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ @@ -60,7 +60,7 @@ tensor-collapse-shape-translate: tensor-collapse-shape-run: @${MLIR_OPT} ./tensor-collapse-shape.mlir \ -arith-bufferize -tensor-bufferize -func-bufferize \ - -finalizing-bufferize -buffer-deallocation -convert-linalg-to-llvm \ + -finalizing-bufferize -buffer-deallocation \ -expand-strided-metadata -lower-affine \ -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ @@ -71,21 +71,22 @@ tensor-extract-lower: @${MLIR_OPT} ./tensor-extract.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir tensor-extract-translate: @${MLIR_OPT} 
./tensor-extract.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + tensor-extract-run: @${MLIR_OPT} ./tensor-extract.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} @@ -94,14 +95,14 @@ tensor-extract-slice-lower: @${MLIR_OPT} ./tensor-extract-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir tensor-extract-slice-translate: @${MLIR_OPT} ./tensor-extract-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll @@ -109,7 +110,7 @@ tensor-extract-slice-run: @${MLIR_OPT} ./tensor-extract-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - 
-convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} @@ -118,14 +119,14 @@ tensor-from-elements-lower: @${MLIR_OPT} ./tensor-from-elements.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir tensor-from-elements-translate: @${MLIR_OPT} ./tensor-from-elements.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll @@ -133,7 +134,7 @@ tensor-from-elements-run: @${MLIR_OPT} ./tensor-from-elements.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} @@ -142,14 +143,14 @@ tensor-insert-lower: @${MLIR_OPT} ./tensor-insert.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ 
-reconcile-unrealized-casts -o ./log.mlir tensor-insert-translate: @${MLIR_OPT} ./tensor-insert.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll @@ -157,7 +158,7 @@ tensor-insert-run: @${MLIR_OPT} ./tensor-insert.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} @@ -166,14 +167,14 @@ tensor-insert-slice-lower: @${MLIR_OPT} ./tensor-insert-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts -o ./log.mlir tensor-insert-slice-translate: @${MLIR_OPT} ./tensor-insert-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ -func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll @@ -181,7 +182,7 @@ tensor-insert-slice-run: @${MLIR_OPT} ./tensor-insert-slice.mlir \ -arith-bufferize -tensor-bufferize -linalg-bufferize -convert-vector-to-llvm \ 
-func-bufferize -buffer-deallocation -convert-linalg-to-loops -expand-strided-metadata \ - -convert-linalg-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ + -finalize-memref-to-llvm -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} diff --git a/examples/MLIRTensor/tensor-insert-slice.mlir b/examples/MLIRTensor/tensor-insert-slice.mlir index 9baac6ccb..cac15ca80 100644 --- a/examples/MLIRTensor/tensor-insert-slice.mlir +++ b/examples/MLIRTensor/tensor-insert-slice.mlir @@ -1,6 +1,6 @@ // RUN: buddy-opt %s \ // RUN: -arith-bufferize -tensor-bufferize -linalg-bufferize \ -// RUN: -convert-linalg-to-loops -convert-scf-to-cf -func-bufferize \ +// RUN: -convert-scf-to-cf -func-bufferize \ // RUN: -buffer-deallocation -convert-linalg-to-loops \ // RUN: -expand-strided-metadata \ // RUN: -finalize-memref-to-llvm -convert-func-to-llvm \ diff --git a/examples/MLIRTensor/tensor-print.mlir b/examples/MLIRTensor/tensor-print.mlir index e33e0184b..ec0b14126 100644 --- a/examples/MLIRTensor/tensor-print.mlir +++ b/examples/MLIRTensor/tensor-print.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -arith-bufferize -tensor-bufferize -linalg-bufferize \ // RUN: -func-bufferize -buffer-deallocation -convert-linalg-to-loops \ -// RUN: -convert-linalg-to-loops -convert-scf-to-cf -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-scf-to-cf -finalize-memref-to-llvm -convert-func-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ diff --git a/examples/README.md b/examples/README.md index 2f940537d..fc551ccdd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -92,7 +92,7 @@ $ buddy-opt -lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}" ``` $ cd buddy-mlir/build/bin -$ 
./buddy-opt ../../examples/DIPDialect/dip.mlir --lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}" +$ ./buddy-opt ../../frontend/Interfaces/DIP.mlir --lower-dip="DIP-strip-mining=${BUDDY_DIP_OPT_STRIP_MINING}" ``` - Edge detection example: @@ -237,7 +237,7 @@ Example: ``` $ cd buddy-mlir/build/bin -$ ./buddy-opt ../../examples/BudDialect/TestConstant.mlir --lower-bud +$ ./buddy-opt ../../examples/BudDialect/bud-print.mlir --lower-bud ``` ## DSL Examples diff --git a/examples/VectorExpDialect/.gitignore b/examples/VectorExpDialect/.gitignore index d32dc0c50..790429d34 100644 --- a/examples/VectorExpDialect/.gitignore +++ b/examples/VectorExpDialect/.gitignore @@ -1,2 +1,3 @@ log* core +a.out diff --git a/examples/VectorExpDialect/makefile b/examples/VectorExpDialect/makefile index 5be430a67..bfecdba83 100644 --- a/examples/VectorExpDialect/makefile +++ b/examples/VectorExpDialect/makefile @@ -124,6 +124,41 @@ vector-exp-predication-matmul-run: -dlopen=${CROSS_MLIR_C_RUNNER_UTILS} \ -dlopen=${CROSS_MLIR_RUNNER_UTILS} +vector-exp-predication-matmul-aot: + @${BUDDY_OPT} ./vector-exp-predication-matmul.mlir \ + -lower-affine \ + -convert-scf-to-cf \ + -convert-math-to-llvm \ + -lower-vector-exp \ + -lower-rvv \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts |\ + ${BUDDY_TRANSLATE} --buddy-to-llvmir | \ + ${LLC} -mtriple riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 --filetype=obj -o log.o + @${RISCV_GNU_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc log.o \ + -L${CROSS_MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \ + -o a.out + @LD_LIBRARY_PATH=${CROSS_MLIR_LIB} ${QEMU} -L ${RISCV_GNU_TOOLCHAIN_SYSROOT} -cpu rv64,x-v=true,vlen=128 a.out + +# vector-exp-predication-matmul-elf: +# @${BUDDY_OPT} ./vector-exp-predication-matmul.mlir \ +# -lower-affine \ +# -convert-scf-to-cf \ +# -convert-math-to-llvm \ +# -lower-vector-exp \ +# -lower-rvv \ +# -convert-vector-to-llvm \ +# 
-finalize-memref-to-llvm \ +# -convert-func-to-llvm \ +# -reconcile-unrealized-casts |\ +# ${BUDDY_TRANSLATE} -buddy-to-llvmir | \ +# ${LLC} -mtriple riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -filetype=obj -o log.o +# @${RISCV_GNU_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc log.o \ +# -static \ +# -o log.elf + vector-exp-add-mask-run: @${BUDDY_OPT} ./vector-exp-add-mask.mlir \ -lower-affine \ @@ -187,3 +222,7 @@ vector-exp-add-predication-asm: ${LLC} ${OPT_FLAG} -mtriple riscv64 -target-abi lp64d \ -mattr=+m,+d,+v -riscv-v-vector-bits-min=128 \ --filetype=asm -o log.s + +vector-exp-dynamic-vector-dump: + @${BUDDY_OPT} ./vector-exp-dynamic-vector.mlir \ + -o log.mlir diff --git a/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir b/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir new file mode 100644 index 000000000..d792bacb0 --- /dev/null +++ b/examples/VectorExpDialect/vector-exp-dynamic-vector.mlir @@ -0,0 +1,51 @@ +#map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)> + +func.func private @printMemrefI32(memref<*xi32>) + +func.func @alloc_mem_i32(%init: i32) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c20 = arith.constant 20 : index + %mem = memref.alloc(%c20) : memref + scf.for %idx0 = %c0 to %c20 step %c1 { + memref.store %init, %mem[%idx0] : memref + } + return %mem : memref +} + +func.func @vector_add(%input1: memref, %input2: memref, %output: memref) { + %c0 = arith.constant 0 : index + // Get the dimension of the workload. + %dim_size = memref.dim %input1, %c0 : memref + // Perform dynamic vector addition. + // Returns four times the physical vl for element type i32. 
+ %vl = vector_exp.get_vl i32, 4 : index + + scf.for %idx = %c0 to %dim_size step %vl { // Tiling + %it_vl = affine.min #map(%idx)[%vl, %dim_size] + vector_exp.set_vl %it_vl : index { + %vec_input1 = vector.load %input1[%idx] : memref, vector<[1]xi32> // vector + %vec_input2 = vector.load %input2[%idx] : memref, vector<[1]xi32> // vector + %vec_output = arith.addi %vec_input1, %vec_input2 : vector<[1]xi32> // vector + vector.store %vec_output, %output[%idx] : memref, vector<[1]xi32> // vector + vector.yield + } + } + return +} + +func.func @main() -> i32 { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + + %input_mem = call @alloc_mem_i32(%c1_i32) : (i32) -> memref + %result_mem = call @alloc_mem_i32(%c0_i32) : (i32) -> memref + + call @vector_add(%input_mem, %input_mem, %result_mem) : (memref, memref, memref) -> () + + %print_result_mem = memref.cast %result_mem : memref to memref<*xi32> + call @printMemrefI32(%print_result_mem) : (memref<*xi32>) -> () + + %ret = arith.constant 0 : i32 + return %ret : i32 +} diff --git a/examples/lit.cfg.py b/examples/lit.cfg.py index 5bb8746c3..724d9cdaa 100644 --- a/examples/lit.cfg.py +++ b/examples/lit.cfg.py @@ -36,7 +36,9 @@ # subdirectories contain auxiliary inputs for various tests in their parent # directories. config.excludes = [ + 'BuddyBert', 'BuddyLlama', + 'BuddyBert', 'ConvOpt', 'DAPDialect', 'DIPDialect', diff --git a/frontend/Interfaces/buddy/Core/Container.h b/frontend/Interfaces/buddy/Core/Container.h index db8b66c17..242199811 100644 --- a/frontend/Interfaces/buddy/Core/Container.h +++ b/frontend/Interfaces/buddy/Core/Container.h @@ -25,10 +25,16 @@ #include #include #include +#include #include +#include #include #include #include +#include +#include +#include +#include #include // MemRef descriptor. @@ -54,6 +60,8 @@ template class MemRef { MemRef &operator=(const MemRef &other); // Move constructor. MemRef(MemRef &&other) noexcept; + // Constructor from file. 
+ MemRef(const std::string& filename, intptr_t sizes[N], intptr_t offset = 0, bool isMmap = false); // Move assignment operator. MemRef &operator=(MemRef &&other) noexcept; // Desctrutor. @@ -96,6 +104,10 @@ template class MemRef { intptr_t sizes[N]; // Strides. intptr_t strides[N]; + // Number of elements. + size_t size; + // File descriptor for mmap + int fd = -1; }; // MemRef Shape Constructor. @@ -277,12 +289,54 @@ MemRef &MemRef::operator=(MemRef &&other) noexcept { return *this; } +template +MemRef::MemRef(const std::string &filename, intptr_t sizes[N], intptr_t offset, + bool isMmap) { + this->offset = offset; + for (size_t i = 0; i < N; i++) { + this->sizes[i] = sizes[i]; + } + setStrides(); + size = product(sizes); + if (isMmap) { + fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) { + assert (0 && "Failed to open file!"); + } + struct stat sb; + if (fstat(fd, &sb) == -1) { + assert (0 && "Failed to get file size!"); + } + if (sb.st_size != size * sizeof(T)) { + assert (0 && "File size does not match!"); + } + allocated = (T *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (allocated == MAP_FAILED) { + assert (0 && "Failed to mmap!"); + } + aligned = allocated; + + } else { + allocated = new T[size]; + aligned = allocated; + std::ifstream in(filename, std::ios::in | std::ios::binary); + if (!in.is_open()) { + throw std::runtime_error("Failed to open file!"); + } + in.read((char *)(aligned), sizeof(T) * (size)); + in.close(); + } + } + // MemRef Destructor. // Note that the `allocated` and `aligned` point to the same address, so it is // enough to release the space of the `allocated` pointer in the destructor. template MemRef::~MemRef() { if (allocated) - free(allocated); + delete[] allocated; + if (fd != -1) { + close(fd); + } } // Get the data pointer. 
diff --git a/frontend/Interfaces/buddy/DAP/DSP/IIR.h b/frontend/Interfaces/buddy/DAP/DSP/IIR.h index c2c3bb1eb..bb3035a9f 100644 --- a/frontend/Interfaces/buddy/DAP/DSP/IIR.h +++ b/frontend/Interfaces/buddy/DAP/DSP/IIR.h @@ -30,13 +30,13 @@ namespace detail { // Declare the Fir C interface. extern "C" { // TODO: support both float and double. -void _mlir_ciface_mlir_iir(MemRef *inputBuddyConv1D, - MemRef *kernelBuddyConv1D, - MemRef *outputBuddyConv1D); - void _mlir_ciface_buddy_iir(MemRef *inputBuddyConv1D, MemRef *kernelBuddyConv1D, MemRef *outputBuddyConv1D); + +void _mlir_ciface_buddy_iir_vectorization(MemRef *inputBuddyConv1D, + MemRef *kernelBuddyConv1D, + MemRef *outputBuddyConv1D); } } // namespace detail @@ -62,15 +62,19 @@ void iirLowpass(MemRef &input, const zpk &filter, T frequency, T fs) { } } -template -void iir(MemRef *input, MemRef *filter, - MemRef *output) { +// Filter parameters are represented by Second Order Section (SOS) filter, which +// accept a MemRef with 2 dimension only (with the second dimension set to 6). 
+template +void IIR(MemRef *input, MemRef *filter, + MemRef *output, bool isVectorization=false) { if (N != 1) assert(0 && "Only mono audio is supported for now."); - if (M != 2) - assert(0 && "Second Order Section (SOS) filter is only supported for now."); - detail::_mlir_ciface_buddy_iir(input, filter, output); + if (!isVectorization) + detail::_mlir_ciface_buddy_iir(input, filter, output); + else + detail::_mlir_ciface_buddy_iir_vectorization(input, filter, output); } + } // namespace dap #endif // FRONTEND_INTERFACES_BUDDY_DAP_DSP_IIR diff --git a/frontend/Interfaces/buddy/DIP/ImageContainer.h b/frontend/Interfaces/buddy/DIP/ImageContainer.h index d90e25337..a25fd4476 100644 --- a/frontend/Interfaces/buddy/DIP/ImageContainer.h +++ b/frontend/Interfaces/buddy/DIP/ImageContainer.h @@ -127,6 +127,7 @@ template Img &Img::operator=(Img &&m) { template Img &Img::operator=(const Img &m) { MemRef::operator=(m); + return *this; } /** diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h b/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h index e0199499d..5fb018ec2 100644 --- a/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h +++ b/frontend/Interfaces/buddy/DIP/imgcodecs/bitstrm.h @@ -111,14 +111,14 @@ template class RMByteStream : public RLByteStream { class WBaseStream { public: // methods - WBaseStream(); - virtual ~WBaseStream(); + inline WBaseStream(); + inline virtual ~WBaseStream(); - virtual bool open(const String &filename); - virtual bool open(std::vector &buf); - virtual void close(); - bool isOpened(); - int getPos(); + inline virtual bool open(const String &filename); + inline virtual bool open(std::vector &buf); + inline virtual void close(); + inline bool isOpened(); + inline int getPos(); protected: uchar *m_start; @@ -130,9 +130,9 @@ class WBaseStream { bool m_is_opened; std::vector *m_buf; - virtual void writeBlock(); - virtual void release(); - virtual void allocate(); + inline virtual void writeBlock(); + inline virtual void release(); + 
inline virtual void allocate(); }; // class WLByteStream - uchar-oriented stream. @@ -140,12 +140,12 @@ class WBaseStream { // first class WLByteStream : public WBaseStream { public: - virtual ~WLByteStream(); + inline virtual ~WLByteStream(); - void putByte(int val); - void putBytes(const void *buffer, int count); - void putWord(int val); - void putDWord(int val); + inline void putByte(int val); + inline void putBytes(const void *buffer, int count); + inline void putWord(int val); + inline void putDWord(int val); }; // class WLByteStream - uchar-oriented stream. @@ -153,9 +153,9 @@ class WLByteStream : public WBaseStream { // last class WMByteStream : public WLByteStream { public: - virtual ~WMByteStream(); - void putWord(int val); - void putDWord(int val); + inline virtual ~WMByteStream(); + inline void putWord(int val); + inline void putDWord(int val); }; inline unsigned BSWAP(unsigned v) { @@ -165,7 +165,7 @@ inline unsigned BSWAP(unsigned v) { const int BS_DEF_BLOCK_SIZE = 1 << 15; -bool bsIsBigEndian(void) { +inline bool bsIsBigEndian(void) { return (((const int *)"\0\x1\x2\x3\x4\x5\x6\x7")[0] & 255) != 0; } diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h b/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h index b2ad33a38..2866a4916 100644 --- a/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h +++ b/frontend/Interfaces/buddy/DIP/imgcodecs/grfmt_png.h @@ -105,7 +105,7 @@ class PngEncoder : public BaseImageEncoder { static void writeDataToBuf(void *png_ptr, uchar *src, size_t size); }; -bool isBigEndian() { +inline bool isBigEndian() { int num = 1; char *ptr = (char *)# return (*ptr == 0); diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h b/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h index 1b39d3577..a4854a7f4 100644 --- a/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h +++ b/frontend/Interfaces/buddy/DIP/imgcodecs/loadsave.h @@ -308,4 +308,4 @@ static bool imwrite(const String &filename, Img &img_vec) { return 
true; } } // namespace dip -#endif \ No newline at end of file +#endif diff --git a/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h b/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h index f848fdc97..52dcbd764 100644 --- a/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h +++ b/frontend/Interfaces/buddy/DIP/imgcodecs/utils.h @@ -52,7 +52,7 @@ #include "buddy/DIP/imgcodecs/replenishment.h" namespace dip { -int validateToInt(size_t sz) { +inline int validateToInt(size_t sz) { int valueInt = (int)sz; assert((size_t)valueInt == sz); return valueInt; @@ -77,26 +77,31 @@ struct PaletteEntry { #define descale(x, n) (((x) + (1 << ((n)-1))) >> (n)) #define saturate(x) (uchar)(((x) & ~255) == 0 ? (x) : ~((x) >> 31)) -void icvCvt_BGR2Gray_8u_C3C1R(const uchar *bgr, int bgr_step, uchar *gray, - int gray_step, _Size size, int swap_rb = 0); - -void FillGrayPalette(PaletteEntry *palette, int bpp, bool negative = false); -bool IsColorPalette(PaletteEntry *palette, int bpp); -void CvtPaletteToGray(const PaletteEntry *palette, uchar *grayPalette, - int entries); -uchar *FillUniColor(uchar *data, uchar *&line_end, int step, int width3, int &y, - int height, int count3, PaletteEntry clr); -uchar *FillUniGray(uchar *data, uchar *&line_end, int step, int width3, int &y, - int height, int count3, uchar clr); -uchar *FillColorRow8(uchar *data, uchar *indices, int len, - PaletteEntry *palette); -uchar *FillGrayRow8(uchar *data, uchar *indices, int len, uchar *palette); -uchar *FillColorRow4(uchar *data, uchar *indices, int len, - PaletteEntry *palette); -uchar *FillGrayRow4(uchar *data, uchar *indices, int len, uchar *palette); -uchar *FillColorRow1(uchar *data, uchar *indices, int len, - PaletteEntry *palette); -uchar *FillGrayRow1(uchar *data, uchar *indices, int len, uchar *palette); +inline void icvCvt_BGR2Gray_8u_C3C1R(const uchar *bgr, int bgr_step, + uchar *gray, int gray_step, _Size size, + int swap_rb = 0); + +inline void FillGrayPalette(PaletteEntry *palette, int bpp, + bool 
negative = false); +inline bool IsColorPalette(PaletteEntry *palette, int bpp); +inline void CvtPaletteToGray(const PaletteEntry *palette, uchar *grayPalette, + int entries); +inline uchar *FillUniColor(uchar *data, uchar *&line_end, int step, int width3, + int &y, int height, int count3, PaletteEntry clr); +inline uchar *FillUniGray(uchar *data, uchar *&line_end, int step, int width3, + int &y, int height, int count3, uchar clr); +inline uchar *FillColorRow8(uchar *data, uchar *indices, int len, + PaletteEntry *palette); +inline uchar *FillGrayRow8(uchar *data, uchar *indices, int len, + uchar *palette); +inline uchar *FillColorRow4(uchar *data, uchar *indices, int len, + PaletteEntry *palette); +inline uchar *FillGrayRow4(uchar *data, uchar *indices, int len, + uchar *palette); +inline uchar *FillColorRow1(uchar *data, uchar *indices, int len, + PaletteEntry *palette); +inline uchar *FillGrayRow1(uchar *data, uchar *indices, int len, + uchar *palette); #define SCALE 14 #define cR (int)(0.299 * (1 << SCALE) + 0.5) diff --git a/frontend/Interfaces/buddy/LLM/TextContainer.h b/frontend/Interfaces/buddy/LLM/TextContainer.h index b5e307abd..28432b3c1 100644 --- a/frontend/Interfaces/buddy/LLM/TextContainer.h +++ b/frontend/Interfaces/buddy/LLM/TextContainer.h @@ -325,7 +325,7 @@ template std::string Text::revertLlama() { const int CLS_ID = 1; const int SEP_ID = 2; - for (size_t i = 0; i < this->getSize(); i++) { + for (size_t i = 0; i < this->tokenCnt; i++) { int id = this->aligned[i]; if (id == PAD_ID || id == CLS_ID) continue; diff --git a/frontend/Interfaces/lib/CMakeLists.txt b/frontend/Interfaces/lib/CMakeLists.txt index e70a24034..9f6f61b29 100644 --- a/frontend/Interfaces/lib/CMakeLists.txt +++ b/frontend/Interfaces/lib/CMakeLists.txt @@ -65,3 +65,36 @@ SET_TARGET_PROPERTIES(BuddyLibDAP PROPERTIES LINKER_LANGUAGE CXX ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY} ) + + add_custom_command(OUTPUT DAPVectorization.o + COMMAND cat 
${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir | + sed 's/buddy_fir/buddy_fir_vectorization/' | + sed 's/buddy_iir/buddy_iir_vectorization/' | + sed 's/buddy_biquad/buddy_biquad_vectorization/' | + ${CMAKE_BINARY_DIR}/bin/buddy-opt + -vectorize-dap + -convert-linalg-to-affine-loops + -arith-expand + -lower-affine + -convert-scf-to-cf + -convert-math-to-llvm + -convert-vector-to-llvm + -finalize-memref-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc + -mtriple=${BUDDY_TARGET_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} + -filetype=obj + -o ${CMAKE_CURRENT_BINARY_DIR}/DAPVectorization.o + DEPENDS buddy-opt + ) + +add_library(BuddyLibDAPVectorization STATIC DAPVectorization.o) + +SET_TARGET_PROPERTIES(BuddyLibDAPVectorization PROPERTIES + LINKER_LANGUAGE CXX + ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY} + ) diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 24002fd64..e89597800 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -16,32 +16,44 @@ # # This is the entry of the Buddy Compiler frontend. # +# TODO[Low]: When integrating more frameworks, `frontend.py` acts as a unified +# entry and driver, separating out compilers/importers for various platforms +# (e.g. DynamoCompiler). 
+# # ===--------------------------------------------------------------------------- -import operator from typing import Any, List, Optional -import functools +import operator +import os +import ctypes +import platform -import mlir.dialects.func as func import mlir.ir as ir +import mlir.dialects.func as func +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir import runtime as rt import torch import torch._dynamo as dynamo from torch._functorch.aot_autograd import aot_module_simplified import torch.utils._pytree as pytree -from .ops.math import ops_registry as math_ops_registry -from .ops.tosa import ops_registry as tosa_ops_registry from .ops.linalg import ops_registry as linalg_ops_registry +from .ops.tosa import ops_registry as tosa_ops_registry +from .ops.math import ops_registry as math_ops_registry +from .graph import Graph, TensorDType, TensorMeta +from .graph.operation import * +from .graph.transform import maxpool2d_simplify class DynamoCompiler: """ Dynamo Compiler is one of the frontends of Buddy Compiler. - Dynamo Compiler acts as a custom compiler for the Torch Dynamo framework, - which converts an FX Graph into an equivalent MLIR module. + Dynamo Compiler acts as a custom compiler for the TorchDynamo framework, + which converts an FX Graph into an equivalent Buddy Graph and MLIR module. Attributes: - imported_module: The imported MLIR module after compilation. + imported_graphs: The imported graphs. imported_params: The imported parameters from the model. """ @@ -50,80 +62,279 @@ def __init__( func_name: str = "forward", primary_registry: Optional[dict] = None, aot_autograd_decomposition: Optional[dict] = None, - do_param_pack: bool = True, ) -> None: """ Initializes the Dynamo Compiler. Args: - func_name (str, optional): The function name to be used. + func_name: The function name to be used. primary_registry (dict, optional): The primary operations registry. 
aot_autograd_decomposition (Optional[dict], optional): - The ahead-of-time autograd decomposition dictionary. + The ahead-of-time autograd decomposition dictionary. + Attributes: + _func_name: The function name to be used. + _aot_autograd_decomposition (Optional[dict], optional): + The ahead-of-time autograd decomposition dictionary. + _imported_graphs: The buddy graphs from dynamo importer. + _ops_registry (dict, optional): The buddy operations' lower func + registry. + _imported_params: The model params extract from torch. + _ops_map: The torch aten ops map with buddy ops. + """ if primary_registry is None: primary_registry = {} self._func_name = func_name self._aot_autograd_decomposition = aot_autograd_decomposition - self._imported_module = None - self._imported_params = None - self._do_param_pack = do_param_pack + self._imported_graphs = [] self._ops_registry = {} + self._imported_params = {} self._ops_registry.update(math_ops_registry) self._ops_registry.update(linalg_ops_registry) self._ops_registry.update(tosa_ops_registry) self._ops_registry.update(primary_registry) + self._ops_map = { + "output": OutputOp, + "placeholder": PlaceholderOp, + "arange.start": ArangeOp, + "arange.default": ArangeOp, + "unsqueeze.default": UnsqueezeOp, + "view.default": ViewOp, + "ones.default": OnesOp, + "full.default": FullOp, + "lt.Tensor": LessThanOp, + "embedding.default": EmbeddingOp, + "masked_fill.Scalar": MaskedFillOp, + "slice.Tensor": SliceOp, + "expand.default": ExpandOp, + "_to_copy.default": ToCopyOp, + "rsub.Scalar": RsubOp, + "pow.Tensor_Scalar": PowOp, + "mean.dim": MeanOp, + "rsqrt.default": RsqrtOp, + "mul.Tensor": MulOp, + "t.default": TOp, + "mm.default": MatmulOp, + "transpose.int": TransposeOp, + "index.Tensor": IndexOp, + "neg.default": NegOp, + "cat.default": CatOp, + "squeeze.dim": SqueezeOp, + "bmm.default": BatchMatmulOp, + "div.Tensor": DivOp, + "_softmax.default": SoftmaxOp, + "clone.default": CloneOp, + "silu.default": SiluOp, + "add.Tensor": 
AddOp, + "addmm.default": AddMMOp, + "permute.default": PermuteOp, + "convert_element_type.default": ConvertElementTypeOp, + "sum.dim_IntList": SumDimOp, + "tanh.default": TanhOp, + "sub.Tensor": SubOp, + "var_mean.correction": VarMeanOp, + "amax.default": AmaxOp, + "select.int": SelectOp, + "exp.default": ExpOp, + "erf.default": ErfOp, + "getitem": GetItemOp, + "convolution.default": Conv2dOp, + "max_pool2d_with_indices.default": MaxPool2dWithIndicesOp, + "relu.default": ReluOp, + "iota.default": IotaOp, + "sigmoid.default": SigmoidOp, + "scalar_tensor.default": ScalarTensorOp, + "where.self": WhereOp, + "sqrt.default": SqrtOp, + "reciprocal.default": ReciprocalOp, + } @property - def imported_module(self): - """Returns the imported MLIR module after compilation.""" - return self._imported_module + def imported_graphs(self): + """Returns the imported buddy graphs after compilation.""" + return self._imported_graphs @property def imported_params(self): - """Returns the imported parameters from the model.""" + """Returns the imported model params after compilation.""" return self._imported_params + def _torch_dtype_translate(self, dtype): + match dtype: + case "torch.int64": + return TensorDType.Int64 + case "torch.int32": + return TensorDType.Int32 + case "torch.float16": + return TensorDType.Float16 + case "torch.float32": + return TensorDType.Float32 + case "torch.float64": + return TensorDType.Float64 + case "torch.bool": + return TensorDType.Bool + case _: + raise NotImplementedError(f"Unsupported dtype: {dtype}") + + def _create_node( + self, + gm_node_name: str, + node_name: str, + node_input: Tuple, + node_users: List[str], + node_output_shape: list = [], + node_output_dtype: TensorDType = None, + node_kwargs: Optional[Dict] = None, + ): + """ + Create buddy op node from torch aten op. + + Args: + gm_node_name: The op node class map to buddy op by _ops_map. + node_name: The op node name to be used. + node_input: The args input to op node. 
+ node_output_shape: The list of the op node's output shape. + node_output_dtype: The TensorDType enum type of the op node's output + data type. + node_kwargs: The restful attributes for op node. + """ + op_class = self._ops_map[gm_node_name] + buddy_node = op_class() + buddy_node._name = node_name + if gm_node_name == "output": + for input_arg in node_input[0]: + buddy_node.add_argument(str(input_arg)) + return buddy_node + for input_arg in node_input: + if isinstance(input_arg, torch.fx.Node): + buddy_node.add_argument(str(input_arg)) + buddy_node.add_parent(str(input_arg)) + elif isinstance(input_arg, torch.dtype): + buddy_node.add_argument(self._torch_dtype_translate(str(input_arg))) + else: + buddy_node.add_argument(input_arg) + for user in node_users: + buddy_node.add_children(user) + if node_kwargs is None: + node_kwargs = {} + buddy_node._keyword_arguments.update(node_kwargs) + buddy_node._tensor_meta["shape"] = node_output_shape + buddy_node._tensor_meta["dtype"] = node_output_dtype + return buddy_node + def _compile_fx( self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor] ) -> Any: """ - Compiles the provided FX Graph to MLIR module. + Compiles the provided FX Graph to Buddy Graph. Args: gm (torch.fx.GraphModule): The GraphModule to be compiled. inputs (List[torch.Tensor]): The input tensors. Returns: - Any: The result of the ahead-of-time compiled module. + dynamo_run: The function of the ahead-of-time compiled module, + return for torchdynamo's call. """ - def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): - """Compile a FX graph in Aten/Prims IR to MLIR.""" - func_params = _inputs[: len(self.imported_params)] - func_inputs = _inputs[len(self.imported_params) :] - - # Initializes the MLIR context. 
- ctx = ir.Context() - with ir.Location.unknown(ctx): - fx_importer = FXGraphImporter( - _gm, - func_params, - func_inputs, - self._do_param_pack, - self._func_name, - self._ops_registry, - ) - self._imported_module = fx_importer.import_graph() - # TODO: Lower to LLVM dialect and use JIT engine to execute. - return _gm.forward - params = { **dict(gm.named_parameters(remove_duplicate=False)), **dict(gm.named_buffers(remove_duplicate=False)), } params_flat, _ = pytree.tree_flatten(params) - self._imported_params = params_flat + + def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): + """Compile a FX graph in Aten/Prims IR to MLIR.""" + nonlocal params_flat + func_inputs = [] + for inp in _inputs[len(params_flat) :]: + inp_shape = inp.shape + inp_dtype = self._torch_dtype_translate(str(inp.dtype)) + func_inputs.append(TensorMeta(inp_shape, inp_dtype)) + fake_params = [] + for param in params_flat: + param_dtype = self._torch_dtype_translate(str(param.dtype)) + fake_params.append(TensorMeta(param.shape, param_dtype)) + graph = Graph( + func_inputs, + fake_params, + self._ops_registry, + self._func_name, + ) + for gm_node in _gm.graph.nodes: + node_users = [] + for user in gm_node.users.keys(): + node_users.append(str(user)) + if gm_node.op == "placeholder": + node_dtype = self._torch_dtype_translate( + str(gm_node.meta["tensor_meta"].dtype) + ) + buddy_node = self._create_node( + gm_node.op, + gm_node.name, + gm_node.args, + node_users, + gm_node.meta["tensor_meta"].shape, + node_dtype, + ) + + elif gm_node.op == "output": + buddy_node = self._create_node( + gm_node.op, + gm_node.name, + gm_node.args, + node_users + ) + + elif gm_node.target is operator.getitem: + node_dtype = self._torch_dtype_translate( + str(gm_node.meta["tensor_meta"].dtype) + ) + buddy_node = self._create_node( + str(gm_node.target.__name__), + gm_node.name, + gm_node.args, + node_users, + gm_node.meta["tensor_meta"].shape, + node_dtype, + ) + + else: + tensor_meta = 
gm_node.meta.get("tensor_meta") + val = gm_node.meta.get("val") + num_returns = len(gm_node.target._schema.returns) + if num_returns == 1: + node_dtype = self._torch_dtype_translate( + str(tensor_meta.dtype) + ) + node_shape = tensor_meta.shape + elif num_returns > 1: + node_dtype = tuple( + [ + self._torch_dtype_translate(str(val_item.dtype)) + for val_item in val + ] + ) + node_shape = tuple([val_item.shape for val_item in val]) + else: + raise RuntimeError("Zero returns is not supported.") + + buddy_node = self._create_node( + str(gm_node.target.__name__), + gm_node.name, + gm_node.args, + node_users, + node_shape, + node_dtype, + node_kwargs=gm_node.kwargs, + ) + + graph.add_node(buddy_node) + transform_list = [maxpool2d_simplify] + graph.perform(transform_list) + self._imported_graphs.append(graph) + self._imported_params[graph] = params_flat + return self.dynamo_run() return aot_module_simplified( gm, @@ -143,11 +354,12 @@ def __call__( inputs (List[torch.Tensor]): The input tensors. Returns: - Any: The result of the ahead-of-time compiled module. + dynamo_run: The function of the ahead-of-time compiled module, + return for torchdynamo's call. """ return self._compile_fx(gm, inputs) - def importer(self, model, *args, **kwargs): + def importer(self, model, *args, **kwargs) -> List[Graph]: """ Imports the provided model as MLIR module and flat parameters. @@ -157,212 +369,145 @@ def importer(self, model, *args, **kwargs): kwargs: Keyword arguments for the model. Returns: - module: The imported MLIR module. - params: The imported flat parameters. + imported_graphs: The imported buddy graphs. """ model_opt = dynamo.optimize(self._compile_fx)(model) model_opt(*args, **kwargs) - module = self._imported_module - params = self._imported_params - return module, params - - -class FXGraphImporter: - """ - Imports an FX graph and generates an MLIR module in high-level dialects. - - Attributes: - _symbol_table (dict): A dictionary to keep track of the symbols. 
- _gm (torch.fx.GraphModule): The FX graph module to be imported. - _func_name (str): Name of the generated MLIR function. - _inputs (List[torch.Tensor]): Input tensor(s) of the FX graph. - _num_input_visited (int): Number of input nodes that have been visited. - _module (mlir.ir.Module): The generated MLIR module. - _ops_registry (dict): Registry for the candidate operations. - """ - - def __init__( - self, - gm: torch.fx.GraphModule, - params: List[torch.Tensor], - inputs: List[torch.Tensor], - do_param_pack: bool = True, - func_name: str = "forward", - ops_registry: Optional[dict] = None, - ): - """ - Initializes the FX Graph importer. - - Args: - gm (torch.fx.GraphModule): The FX graph that will be imported. - inputs (List[torch.Tensor]): Input tensor(s) of the FX graph. - func_name (str): Name of the generated MLIR function. - ops_registry (dict): Registry for the candidate operations. - """ - if ops_registry is None: - ops_registry = {} - self._symbol_table = {} - self._gm = gm - self._func_name = func_name - self._params = params - self._inputs = inputs - self._do_param_pack = do_param_pack - self._param_packs = [] - self._num_input_visited = 0 - self._module = ir.Module.create() - self._ops_registry = ops_registry - self._current_param_pack_offset = None - - def _torch_dtype_to_mlir_dtype(self, dtype: torch.dtype) -> ir.Type: - """ - Converts a torch dtype to the corresponding MLIR dtype. - - Args: - dtype (torch.dtype): The torch data type. + return self._imported_graphs - Returns: - mlir.ir.Type: The corresponding MLIR data type. - - Raises: - NotImplementedError: If the given dtype is not supported. 
- """ - match dtype: - case torch.int32: - return ir.IntegerType.get_signless(32) - case torch.int64: - return ir.IntegerType.get_signless(64) - case torch.float32: - return ir.F32Type.get() - case torch.bool: - return ir.IntegerType.get_signless(1) - case _: - raise NotImplementedError(f"Unsupported dtype {dtype}") - - def _pack_params(self) -> None: - dtypes = list(set([param.dtype for param in self._params])) - dtypes.sort(key=str) - self._current_param_pack_offset = {dtype: 0 for dtype in dtypes} - for dtype in dtypes: - params_of_dtype = [ - param for param in self._params if param.dtype == dtype - ] - param_total_size = 0 - for param in params_of_dtype: - param_total_size += functools.reduce( - lambda x, y: x * y, list(param.shape) - ) - mlir_dtype = self._torch_dtype_to_mlir_dtype(dtype) - self._param_packs.append( - ir.RankedTensorType.get([param_total_size], mlir_dtype) - ) - - def import_graph(self) -> ir.Module: + def dynamo_run(self): """ - Imports FX graph and generates an MLIR module in high-level dialects. + A callable method that wraps around the `exec_buddy_graph` method. Returns: - mlir.ir.Module: An MLIR module in high-level dialects. 
- """ - with ir.InsertionPoint(self._module.body): - arguments = [] - if self._do_param_pack: - self._pack_params() - arguments.extend(self._param_packs) - inputs = self._inputs - else: - inputs = self._params + self._inputs - for arg in inputs: - shape_list = list(arg.shape) - torch_dtype = arg.dtype - mlir_dtype = self._torch_dtype_to_mlir_dtype(torch_dtype) - tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) - arguments.append(tensor_arg) - - @func.FuncOp.from_py_func(*arguments, name=self._func_name) - def generated_func(*args): - args_list = list(args) - for node in self._gm.graph.nodes: - if not ( - node.op in ["output", "placeholder", "call_function"] - or node.target is operator.getitem - ): - continue - if node.op == "output": - output_node_args = node.args[0] - returns = [ - self._symbol_table.get((str(output_arg), 0)) - for output_arg in output_node_args - ] - self._symbol_table[("output", 0)] = returns - elif node.op == "placeholder": - self._import_placeholder(node, args_list) - elif node.target is operator.getitem: - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] - else: - self._import_op(node) - - return self._symbol_table.get(("output", 0)) - - return self._module - - def _import_placeholder( - self, node: torch.fx.Node, args_list: List[ir.BlockArgument] - ): + exec_buddy_graph: The function of the ahead-of-time compiled module, + return for torchdynamo's call. """ - Imports a placeholder node from the FX graph. - Args: - node (torch.fx.Node): The FX node representing the placeholder. - args_list (List[mlir.ir.BlockArgument]): List of input tensors. 
- """ - if self._num_input_visited < len(self._params): - dtype = node.meta["tensor_meta"].dtype - pack_of_dtype = None - for pack in args_list: - if ir.RankedTensorType( - pack.type - ).element_type == self._torch_dtype_to_mlir_dtype(dtype): - pack_of_dtype = pack - break - placeholder_name = self._ops_registry["param.extract"]( - node, self._current_param_pack_offset[dtype], pack_of_dtype - ).result - self._current_param_pack_offset[dtype] += functools.reduce( - lambda x, y: x * y, list(node.meta["tensor_meta"].shape) - ) - else: - if len(self._params) > 0: - placeholder_name = args_list[ - self._num_input_visited - - len(self._params) - + len(self._param_packs) - ] + def get_lib_extension(): + if platform.system() == "Linux": + return ".so" + elif platform.system() == "Darwin": + return ".dylib" else: - placeholder_name = args_list[self._num_input_visited] - - self._symbol_table[(str(node.name), 0)] = placeholder_name - self._num_input_visited += 1 - - def _import_op(self, node: torch.fx.Node): - """ - Imports an operation node from the FX graph. - - Args: - node (torch.fx.Node): The FX node representing the operation. - - """ - op_name = node.target.__name__ - op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) + raise RuntimeError("Unsupported platform") + + # Dynamo's graph break may import more than one graph. + graph = self._imported_graphs[-1] + graph.compile() + # Collect dependency libraries. + lib_extension = get_lib_extension() + lib_names = ["libmlir_runner_utils", "libmlir_c_runner_utils", "libomp"] + path_prefix = os.path.dirname(os.path.abspath(__file__)) + lib_base_path = os.path.join(path_prefix, "../../../../llvm/build/lib/") + lib_base_path = os.path.abspath(lib_base_path) + shared_libs = [ + os.path.join(lib_base_path, lib_name + lib_extension) + for lib_name in lib_names + ] + # Define execution engine. 
+ ee = ExecutionEngine( + graph._imported_module, opt_level=3, shared_libs=shared_libs ) - if isinstance(op_ret, tuple): - for i, operation in enumerate(op_ret): - self._symbol_table[(str(node.name), i)] = operation.result - elif isinstance(op_ret, ir.OpResult): - self._symbol_table[(str(node.name), 0)] = op_ret - else: - self._symbol_table[(str(node.name), 0)] = op_ret.result + + def cast_c_ptr(outdata_ptr, memref_ptr): + """ + Casts a C pointer (`outdata_ptr`) to the type of another C pointer + (`memref_ptr`). + + Args: + outdata_ptr: ctypes.POINTER + The C pointer whose type needs to be cast. + memref_ptr: ctypes.POINTER + The reference C pointer whose type will be used for casting. + + Returns: + ctypes.POINTER + A new C pointer with the type of `memref_ptr`, representing the + same memory location as `outdata_ptr`. + + Example: + outdata = ctypes.pointer(ctypes.c_int()) + memref = ctypes.pointer(ctypes.c_float()) + casted_ptr = cast_c_ptr(outdata, memref) + # Now `casted_ptr` points to the same memory location as `outdata`, + but with the type of `memref`. + """ + outdata_addr = ctypes.addressof(outdata_ptr.contents) + out_ptr = ctypes.cast(outdata_addr, type(memref_ptr)) + return out_ptr + + def move_c_ptr(outdata_ptr, memref_ptr): + """ + Moves a C pointer (`outdata_ptr`) to the next element in memory, + based on the size of the referenced type in another C pointer + (`memref_ptr`). + + Args: + outdata_ptr: ctypes.POINTER + The C pointer whose position needs to be moved. + memref_ptr: ctypes.POINTER + The reference C pointer whose type determines the size of each + element for the move. + + Returns: + ctypes.POINTER + A new C pointer pointing to the next element in memory, based on + the size of the type referenced by `memref_ptr`. 
+ """ + elem_size = ctypes.sizeof(memref_ptr.contents) + outdata_addr = ctypes.addressof(outdata_ptr.contents) + out_ptr = ctypes.cast(outdata_addr + elem_size, type(memref_ptr)) + return out_ptr + + def exec_buddy_graph(*args): + """ + Execute a graph using TorchDynamo with the provided input tensors. + + Args: + *args: List[torch.Tensor] + Input tensors to be passed to the graph's function. + + Returns: + List[torch.Tensor] + The result of executing the graph, represented as a list of + output tensors. + """ + # A list of ctypes pointers representing memory references for input + # tensors. + input_memref = [ + ctypes.pointer( + ctypes.pointer( + rt.get_ranked_memref_descriptor(tensor.numpy()) + ) + ) + for tensor in args + ] + # A list of ctypes pointers representing memory references for + # output tensors. + output_memref = [ + ctypes.pointer(ctypes.pointer(graph._output_descriptor())) + ] + args_memref = output_memref + input_memref + # Invoke the graph's function using the provided execution engine + # and memory references + ee.invoke(graph._func_name, *args_memref) + + output_tensor = [] + outdata_ptr = args_memref[0][0] + # Iterate through each output memory reference in the graph + for output_ptr in graph._output_memref: + # Cast the output data pointer to the type of the current output + # memory reference + data_ptr = cast_c_ptr(outdata_ptr, output_ptr[0]) + # Convert the C data pointer to a NumPy array and append it to + # the output_tensor list + output_tensor.append(rt.ranked_memref_to_numpy(data_ptr)) + # Move to the next element in memory based on the size of the + # current output type + outdata_ptr = move_c_ptr(outdata_ptr, output_ptr[0]) + # Convert each NumPy array to a PyTorch tensor and return the list + # of tensors + return [torch.from_numpy(tensor) for tensor in output_tensor] + + return exec_buddy_graph diff --git a/frontend/Python/graph/__init__.py b/frontend/Python/graph/__init__.py new file mode 100644 index 000000000..bd927a3c0 --- 
/dev/null +++ b/frontend/Python/graph/__init__.py @@ -0,0 +1,23 @@ +# ===- __init__.py ------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# Init the packages in graph directory. +# +# ===--------------------------------------------------------------------------- + +from .graph import Graph +from .operation import * +from .type import TensorDType, TensorMeta diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py new file mode 100644 index 000000000..be2ce438c --- /dev/null +++ b/frontend/Python/graph/graph.py @@ -0,0 +1,487 @@ +# ===- graph.py ---------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the graph level of the Buddy Compiler frontend. 
+# +# ===--------------------------------------------------------------------------- + +from typing import Any, List, Optional +from types import FunctionType +import ctypes +import functools + +import numpy as np +import mlir.ir as ir +import mlir.dialects.func as func +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir import runtime as rt + +from .operation import * +from .type import * + + +def make_output_memref_descriptor(ranks, dtypes): + """ + Make an output memref descriptor for the given memref ranks and dtypes. + + Parameters: + - ranks: List[int] + A list of integers representing the ranks of each memref. + - dtypes: List[str] + A list of strings representing the data types of each memref. + + Returns: + ctypes.Structure + An output memref descriptor struct. + + Example: + ranks = [2, 3, 1] + dtypes = [np.float32, np.int64, np.bool] + descriptor = make_output_memref_descriptor(ranks, dtypes) + # Use the descriptor in your code + """ + memref_descriptor = [] + for i, rank, dtype in zip(range(len(ranks)), ranks, dtypes): + memref_descriptor.append( + (str(i), rt.make_nd_memref_descriptor(rank, dtype)) + ) + + class OutputDescriptor(ctypes.Structure): + """Builds an output struct descriptor for the multi memref.""" + + _fields_ = memref_descriptor + + return OutputDescriptor + + +class Graph: + """ + Graph is a graph-level expression for the Buddy Compiler frontends. + It acts as a model compute graph, which converts a Graph into an equivalent + MLIR module. + + Attributes: + - _body: List[Op] + The sequence of operation nodes in the graph. + - _inputs: List[TensorMeta] + The model inputs represented as TensorMeta objects. + - _fake_params: List[TensorMeta] + The fake parameters represented as TensorMeta objects. + - device: str + The hardware for graph runtime. + - _imported_module: Union[None, ImportedModuleType] + The imported MLIR module after compilation, if set. 
+ - _ops_registry: dict + The ops lower strategy for the graph. + - _func_name: str + The function name for the MLIR module. + - _ctx: ir.Context + The context of the MLIR module. + - _output_memref: Union[None, ctypes.POINTER] + The memref pointer in the MLIR function output, if set. + - _output_descriptor: Union[None, OutputDescriptorType] + The output descriptor for the MLIR function, if set. + - ee_: Union[None, ExecutionEngineType] + The execution engine for the graph, if set. + """ + + def __init__( + self, + inputs: List[TensorMeta], + fake_params: List[TensorMeta], + ops_registry: dict, + func_name: str, + ) -> None: + """ + Initializes the Graph. + + Args: + inputs: List[TensorMeta] + The model inputs represented as TensorMeta objects. + fake_params: List[TensorMeta] + The fake parameters represented as TensorMeta objects. + ops_registry: dict + The ops lower strategy for the graph. + func_name: str + The function name for the MLIR module. + """ + self._body = [] + self._inputs = inputs + self.node_table: Dict[str, Op] = {} + self._fake_params = fake_params + self.device = "cpu" + self._imported_module = None + self._ops_registry = ops_registry + self._func_name = func_name + self._ctx = ir.Context() + self._output_memref = None + self._output_descriptor = None + self.execution_engine = None + + def add_node(self, node: Op): + """ + Adds an operation node to the graph's body. + + Parameters: + - node: Op + The operation node to be added to the graph. + + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + op_node = Op() + graph_instance.add_node(op_node) + # The op_node is now part of the graph's body + """ + self._body.append(node) + self.node_table[node.name] = node + + def perform(self, func_list: List[FunctionType]): + for transform_func in func_list: + transform_func(self) + + def lower_to_top_level_ir(self, do_params_pack=False): + """ + Lowers the graph to top-level MLIR dialects. 
+ + Parameters: + - do_params_pack: bool, optional (default=False) + Flag indicating whether to perform parameters packing to one memref. + + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + graph_instance.lower_to_top_level_ir(do_params_pack=True) + # The graph is now lowered to top-level MLIR dialects + """ + with ir.Location.unknown(self._ctx): + fx_importer = GraphImporter( + self._body, + self._fake_params, + self._inputs, + do_params_pack, + self._func_name, + self._ops_registry, + ) + self._imported_module = fx_importer.import_graph() + outputs = fx_importer.get_output_nodes() + self._output_memref = [] + output_ranks = [] + output_dtypes = [] + for out_node in outputs: + out_type = ir.RankedTensorType(out_node.type) + shape = list(out_type.shape) + dtype = out_type.element_type + match str(dtype): + case "i1": + np_type = np.dtype(np.bool_) + case "i32": + np_type = np.dtype(np.int32) + case "i64": + np_type = np.dtype(np.int64) + case "f32": + np_type = np.dtype(np.float32) + case _: + raise NotImplementedError(f"Unsupported dtype {dtype}") + self._output_memref.append( + ctypes.pointer( + ctypes.pointer( + rt.make_nd_memref_descriptor( + len(shape), rt.as_ctype(np_type) + )() + ) + ) + ) + output_ranks.append(len(shape)) + output_dtypes.append(rt.as_ctype(np_type)) + self._output_descriptor = make_output_memref_descriptor( + output_ranks, output_dtypes + ) + + def lower_to_llvm_ir(self): + """ + Lower graph to llvm ir. 
+ """ + if self._imported_module is None: + self.lower_to_top_level_ir() + + with ir.Location.unknown(self._ctx): + pm = PassManager("builtin.module") + pm.add("func.func(tosa-to-linalg-named)") + pm.add("func.func(tosa-to-linalg)") + pm.add("func.func(tosa-to-tensor)") + pm.add("func.func(tosa-to-arith)") + pm.run(self._imported_module.operation) + pm.add("arith-expand") + pm.add("eliminate-empty-tensors") + pm.add("empty-tensor-to-alloc-tensor") + pm.add("convert-elementwise-to-linalg") + pm.add('one-shot-bufferize') + pm.add("func.func(convert-linalg-to-affine-loops)") + pm.add("affine-loop-fusion") + pm.add("func.func(affine-parallelize)") + pm.add("lower-affine") + pm.add("convert-scf-to-openmp") + pm.add("func-bufferize") + pm.add("arith-bufferize") + pm.add("func.func(tensor-bufferize)") + pm.add("func.func(buffer-deallocation)") + pm.add("func.func(finalizing-bufferize)") + pm.add("expand-strided-metadata") + pm.add("convert-vector-to-llvm") + pm.add("memref-expand") + pm.add("arith-expand") + pm.add("convert-arith-to-llvm") + pm.add("finalize-memref-to-llvm") + pm.add("convert-scf-to-cf") + pm.add("func.func(llvm-request-c-wrappers)") + pm.add("convert-openmp-to-llvm") + pm.add("convert-math-to-llvm") + pm.add("convert-math-to-libm") + pm.add("convert-func-to-llvm") + pm.add("reconcile-unrealized-casts") + pm.run(self._imported_module.operation) + + def compile(self): + """ + Compile graph from Buddy Graph to LLVM IR. + """ + self.lower_to_top_level_ir() + self.lower_to_llvm_ir() + + +class GraphImporter: + """ + Imports an buddy graph and generates an MLIR module in high-level dialects. + + Attributes: + _symbol_table (dict): A dictionary to keep track of the symbols. + _body (List[Op]): The FX graph module to be imported. + _func_name (str): Name of the generated MLIR function. + _inputs (List[TensorMeta]): Input tensor(s) of the FX graph. + _num_input_visited (int): Number of input nodes that have been visited. 
+ _module (mlir.ir.Module): The generated MLIR module. + _ops_registry (dict): Registry for the candidate operations. + """ + + def __init__( + self, + body: List[Op], + params: List[TensorMeta], + inputs: List[TensorMeta], + do_param_pack: bool, + func_name: str, + ops_registry: dict, + ): + """ + Initializes the buddy Graph importer. + + Args: + gm (Graph): The buddy graph that will be imported. + inputs (List[TensorMeta]): Input tensor(s) of the buddy graph. + func_name (str): Name of the generated MLIR function. + ops_registry (dict): Registry for the candidate operations. + """ + if ops_registry is None: + ops_registry = {} + self._symbol_table = {} + self._body = body + self._func_name = func_name + self._params = params + self._inputs = inputs + self._do_param_pack = do_param_pack + self._param_packs = [] + self._num_input_visited = 0 + self._module = ir.Module.create() + self._ops_registry = ops_registry + self._current_param_pack_offset = None + + def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: + """ + Converts a str to the corresponding MLIR dtype. + + Args: + dtype (str): The tensor type. + + Returns: + mlir.ir.Type: The corresponding MLIR data type. + + Raises: + NotImplementedError: If the given dtype is not supported. + """ + match dtype: + case TensorDType.Int32: + return ir.IntegerType.get_signless(32) + case TensorDType.Int64: + return ir.IntegerType.get_signless(64) + case TensorDType.Float32: + return ir.F32Type.get() + case TensorDType.Bool: + return ir.IntegerType.get_signless(1) + case _: + raise NotImplementedError(f"Unsupported dtype {dtype}") + + def _pack_params(self) -> None: + """ + Packs parameters of the graph to one memref. + + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + graph_instance._pack_params() + # The parameters of the graph are now packed to one memref. 
+ """ + dtypes = list(set([param.dtype for param in self._params])) + dtypes.sort(key=str) + self._current_param_pack_offset = {dtype: 0 for dtype in dtypes} + for dtype in dtypes: + params_of_dtype = [ + param for param in self._params if param.dtype == dtype + ] + param_total_size = 0 + for param in params_of_dtype: + param_total_size += functools.reduce( + lambda x, y: x * y, list(param.shape), 1 + ) + mlir_dtype = self._str_to_mlir_dtype(dtype) + self._param_packs.append( + ir.RankedTensorType.get([param_total_size], mlir_dtype) + ) + + def import_graph(self) -> ir.Module: + """ + Imports buddy graph and generates an MLIR module in high-level dialects. + + Returns: + mlir.ir.Module: An MLIR module in high-level dialects. + """ + with ir.InsertionPoint(self._module.body): + arguments = [] + if self._do_param_pack: + self._pack_params() + arguments.extend(self._param_packs) + inputs = self._inputs + else: + inputs = self._params + self._inputs + for arg in inputs: + shape_list = list(arg.shape) + dtype = arg.dtype + mlir_dtype = self._str_to_mlir_dtype(dtype) + tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) + arguments.append(tensor_arg) + + @func.FuncOp.from_py_func(*arguments, name=self._func_name) + def generated_func(*args): + args_list = list(args) + for node in self._body: + if isinstance(node, OutputOp): + output_node_args = node.args + returns = [ + self._symbol_table.get((str(output_arg), 0)) + for output_arg in output_node_args + ] + self._symbol_table[("output", 0)] = returns + elif isinstance(node, PlaceholderOp): + self._import_placeholder(node, args_list) + elif isinstance(node, GetItemOp): + self._symbol_table[ + (str(node.name), 0) + ] = self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + else: + self._import_op(node) + + return self._symbol_table.get(("output", 0)) + + return self._module + + def _import_placeholder( + self, node: PlaceholderOp, args_list: List[ir.BlockArgument] + ): + """ + Imports a placeholder node from 
the Buddy graph. + + Parameters: + - node (PlaceholderOp): The PlaceholderOp node representing the + placeholder. + - args_list (List[mlir.ir.BlockArgument]): List of input memrefs. + + Returns: + None + """ + if self._num_input_visited < len(self._params) and self._do_param_pack: + dtype = node.tensor_meta["dtype"] + pack_of_dtype = None + for pack in args_list: + if ir.RankedTensorType( + pack.type + ).element_type == self._str_to_mlir_dtype(dtype): + pack_of_dtype = pack + break + placeholder_name = self._ops_registry["param.extract"]( + node, self._current_param_pack_offset[dtype], pack_of_dtype + ).result + self._current_param_pack_offset[dtype] += functools.reduce( + lambda x, y: x * y, list(node.tensor_meta["shape"]), 1 + ) + elif self._do_param_pack: + if len(self._params) > 0: + placeholder_name = args_list[ + self._num_input_visited + - len(self._params) + + len(self._param_packs) + ] + else: + placeholder_name = args_list[self._num_input_visited] + else: + placeholder_name = args_list[self._num_input_visited] + + self._symbol_table[(str(node.name), 0)] = placeholder_name + self._num_input_visited += 1 + + def _import_op(self, node: Op): + """ + Imports an operation node from the buddy graph. + + Args: + node (Op): The buddy node representing the operation. + + """ + op_name = node.__class__.__name__ + op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + if isinstance(op_ret, tuple): + for i, operation in enumerate(op_ret): + self._symbol_table[(str(node.name), i)] = operation.result + elif isinstance(op_ret, ir.OpResult): + self._symbol_table[(str(node.name), 0)] = op_ret + else: + self._symbol_table[(str(node.name), 0)] = op_ret.result + + def get_output_nodes(self): + """ + Get output nodes from the lowered mlir func. 
+ """ + return self._symbol_table.get(("output", 0)) diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py new file mode 100644 index 000000000..550f3f321 --- /dev/null +++ b/frontend/Python/graph/operation.py @@ -0,0 +1,456 @@ +# ===- operation.py ------------------------------------------------------------ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the operation structure of Buddy Compiler graph representation. +# +# ===--------------------------------------------------------------------------- + +from enum import Enum +from typing import Dict, Optional, List, Tuple + +from .type import TensorDType, TensorMeta + + +class OpType(Enum): + """ + Enum class for declaring operation types. + + Members: + - BroadcastType: int + Represents a broadcast operation. + - ElementwiseType: int + Represents an elementwise operation. + - ReshapeType: int + Represents a reshape operation. + - ReduceType: int + Represents a reduction operation. + - ConcatType: int + Represents a concatenation operation. + - PlaceholderType: int + Represents a placeholder operation. + - GetItemType: int + Represents an operation to retrieve an item. + + Note: The underlying values are integers for these operation types. 
+ """ + + BroadcastType = 0 + ElementwiseType = 1 + ReshapeType = 2 + ReduceType = 3 + ConcatType = 4 + PlaceholderType = 5 + GetItemType = 6 + + +class Op: + """ + Base class for all operations in a computational graph. + + Attributes: + - _name: str + The unique name of the operation node. + - _arguments: list + The input arguments of the operation node. + - _keyword_arguments: dict + The keyword arguments of the operation node. + - _tensor_meta: dict + The metadata of the output tensor, including shape and data type. + - _op_type: OpType + The type of the operation node, as defined in the OpType enum. + """ + + def __init__(self) -> None: + """ + Initialize a new instance of the Op class. + """ + self._name = None + self._arguments = [] + self._keyword_arguments = {} + self._tensor_meta: List[TensorMeta] = {} + self._op_type: OpType = None + self._children: List[str] = [] + self._parents: List[str] = [] + + def add_argument(self, arg): + """ + Add an input argument to the operation node. + + Parameters: + - arg: Any + The input argument to be added. + """ + self._arguments.append(arg) + + def add_parent(self, parent: str): + """ + Add an parent node's name to the operation node. + + Parameters: + - parent: str + The parent node's name to be added. + """ + self._parents.append(parent) + + def add_children(self, child): + """ + Add an user node's name to the operation node. + + Parameters: + - user: str + The user node's name to be added. 
+ """ + self._children.append(child) + + @property + def args(self): + return self._arguments + + @property + def kwargs(self): + return self._keyword_arguments + + @property + def name(self): + return self._name + + @name.setter + def name(self, new_name): + self._name = new_name + + @property + def tensor_meta(self): + return self._tensor_meta + + +class PlaceholderOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class MatmulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class GetItemOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.GetItemType + + +class OutputOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.GetItemType + + +class ArangeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class UnsqueezeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ViewOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class EmbeddingOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class OnesOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class FullOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class LessThanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class MaskedFillOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class SliceOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ToCopyOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class RsubOp(Op): + def 
__init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class PowOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class MeanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class RsqrtOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class MulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class TransposeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class IndexOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class NegOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class CatOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ConcatType + + +class SqueezeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class BatchMatmulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class DivOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class SoftmaxOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class CloneOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class SiluOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class AddOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class AddMMOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class AmaxOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType 
+ + +class SubOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class ConvertElementTypeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class ExpOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class ExpandOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class PermuteOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ReshapeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class SelectOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class SumDimOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class TanhOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class VarMeanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class TOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ErfOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class Conv2dOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW_FCHW" + +class ReluOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class SigmoidOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class IotaOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + +class ScalarTensorOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + +class WhereOp(Op): + 
def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class MaxPool2dWithIndicesOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW" + + +class MaxPool2dOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW" + + +class ReciprocalOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class SqrtOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py new file mode 100644 index 000000000..c4b7ac3d1 --- /dev/null +++ b/frontend/Python/graph/transform/__init__.py @@ -0,0 +1 @@ +from .useless_op_eliminate import maxpool2d_simplify \ No newline at end of file diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py new file mode 100644 index 000000000..1b3f59296 --- /dev/null +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -0,0 +1,66 @@ +# ===- maxpool2d_simplify.py --------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# simplify the maxpool2d with getitem. 
+# +# ===--------------------------------------------------------------------------- + +from .. import Graph +from ..operation import * + + +def maxpool2d_simplify(graph: Graph): + """ + Fuse the maxpool op and getitem op to simpllify graph. + + Args: + graph (torch.fx.GraphModule): The Graph to be simplified. + """ + for i, node in enumerate(graph._body): + if isinstance(node, MaxPool2dWithIndicesOp): + getitem_num = 0 + for user in node._children: + if isinstance(graph.node_table[user], GetItemOp): + getitem_num += 1 + getitem_node = graph.node_table[user] + if ( + getitem_num == 1 + and len(node._children) == 1 + and getitem_node.args[1] == 0 + ): + new_node = MaxPool2dOp() + new_node.name = getitem_node.name + for arg in node.args: + new_node.add_argument(arg) + for parent in node._parents: + new_node.add_parent(parent) + for child in getitem_node._children: + new_node.add_children(child) + new_node.tensor_meta["shape"] = getitem_node.tensor_meta[ + "shape" + ] + new_node.tensor_meta["dtype"] = getitem_node.tensor_meta[ + "dtype" + ] + new_node._layout = node._layout + del graph.node_table[node.name] + del graph.node_table[getitem_node.name] + graph.node_table[new_node.name] = new_node + del graph._body[i] + for j, op in enumerate(graph._body): + if op == getitem_node: + graph._body[j] = new_node + break diff --git a/frontend/Python/graph/type.py b/frontend/Python/graph/type.py new file mode 100644 index 000000000..5e1db3ed8 --- /dev/null +++ b/frontend/Python/graph/type.py @@ -0,0 +1,79 @@ +# ===- type.py ----------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the tensor type of the Buddy Compiler frontend. +# +# ===--------------------------------------------------------------------------- + +from enum import Enum + + +class TensorDType(Enum): + """ + Enum class for declaring tensor data types. + + Members: + - Int32: str + Represents the 32-bit integer data type. + - Int64: str + Represents the 64-bit integer data type. + - Float32: str + Represents the 32-bit floating-point data type. + - Bool: str + Represents the boolean data type. + """ + + Int32 = "int32" + Int64 = "int64" + Float16 = "float16" + Float32 = "float32" + Float64 = "float64" + Bool = "bool" + + +class TensorMeta: + """ + Store tensor metadata, including shape and data type, while overlooking raw + data. + + Attributes: + - shape: tuple + Represents the shape of the tensor. + - dtype: str + Represents the data type of the tensor. + + Methods: + - __init__(shape: tuple, dtype: str) -> None: + Initializes a new instance of the TensorMeta class with the specified + shape and data type. + + Example: + meta = TensorMeta(shape=(3, 4), dtype='float32') + # Access metadata attributes: meta.shape, meta.dtype + """ + + def __init__(self, shape, dtype) -> None: + """ + Initialize a new instance of the TensorMeta class. + + Parameters: + - shape: tuple + Represents the shape of the tensor. + - dtype: str + Represents the data type of the tensor. 
+ """ + self.shape = shape + self.dtype = dtype diff --git a/frontend/Python/ops/linalg.py b/frontend/Python/ops/linalg.py index 6a6e161c9..0a22478e1 100644 --- a/frontend/Python/ops/linalg.py +++ b/frontend/Python/ops/linalg.py @@ -14,29 +14,70 @@ # # ===--------------------------------------------------------------------------- # -# The registry of mappings from Torch node to MLIR linalg dialect operations. +# The registry of mappings from Buddy Graph to MLIR linalg dialect operations. # # ===--------------------------------------------------------------------------- from typing import Dict, Tuple, List -import torch - import mlir.ir as ir from mlir.dialects import tosa, linalg, arith, tensor, math import copy import numpy import functools +from ..graph import * +from ..graph.graph import TensorDType +from .utils import * + + +def add_op(node: AddOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): + """ + Import tensor add operation. + From buddy AddOp to MLIR arith `constant` operation. + + Note: this function init an output tensor according input range. + + Args: + node: Containing information from the input graph node. + symbol_table: A dictionary mapping symbols to their corresponding + operations. + + Returns: + op: The operation representing the result tensor of two input nodes' add + result. 
+ """ + input1 = symbol_table.get((str(node.args[0]), 0)) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + shape = list(node.tensor_meta["shape"]) + if isinstance(node.args[1], str): + input2 = symbol_table.get((str(node.args[1]), 0)) + else: + data = [node.args[1]] + input2_shape = numpy.array(data).shape + tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, node.args[1]) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + input2 = arith.ConstantOp(tensor_type, attr).result + if input1 is None or input2 is None: + return + add_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype) + op = tosa.AddOp( + add_result_tensor_type, + input1, + input2, + ) + return op.result + def arange_op( - node: torch.fx.Node, + node: ArangeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import tensor arange operation. - From PyTorch `aten.arange.default` and `aten.arange.start` operator to MLIR - arith `constant` operation. + From buddy ArangeOp to MLIR arith `constant` operation. Note: this function init an output tensor according input range. @@ -49,51 +90,34 @@ def arange_op( op: The operation representing the result tensor of ranging the start and end from input node. 
""" - if node.target.__name__ == "arange.start": + if len(node.args) == 2: start = int(node.args[0]) end = int(node.args[1]) - stride = int(node.meta["tensor_meta"].stride[0]) - dtype = str(node.meta["tensor_meta"].dtype) - shape = list(node.meta["tensor_meta"].shape) - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.array([i for i in range(start, end, stride)]), - signless=True, - type=tensor_type, - ) - op = arith.ConstantOp(tensor_type, attr) - - elif node.target.__name__ == "arange.default": + else: start = 0 end = int(node.args[0]) - stride = int(node.meta["tensor_meta"].stride[0]) - dtype = str(node.meta["tensor_meta"].dtype) - shape = list(node.meta["tensor_meta"].shape) - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.array([i for i in range(start, end, stride)]), - signless=True, - type=tensor_type, - ) - op = arith.ConstantOp(tensor_type, attr) + stride = 1 + dtype = node.tensor_meta["dtype"] + shape = list(node.tensor_meta["shape"]) + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(shape, dtype) + attr = ir.DenseElementsAttr.get( + numpy.array([i for i in range(start, end, stride)]), + signless=True, + type=tensor_type, + ) + op = arith.ConstantOp(tensor_type, attr) return op def unsqueeze_op( - node: torch.fx.Node, + node: UnsqueezeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the unsqueeze operation. - From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape` - operation. - - Note: "unsqueeze" means inserting a new dimension of size 1 at the specified - position. For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + From buddy UnsqueezeOp to MLIR TOSA `reshape` operation. Args: node: Containing information from the input graph node. 
@@ -118,12 +142,12 @@ def unsqueeze_op( def view_op( - node: torch.fx.Node, + node: ViewOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor view operation. - From PyTorch `aten.view.default` operator to MLIR TOSA `reshape` operation. + From buddy ViewOp to MLIR TOSA `reshape` operation. Note: If the new shape contains one and only one `-1`, the size of the new shape will be inferred automatically. @@ -160,13 +184,12 @@ def view_op( def embedding_op( - node: torch.fx.Node, + node: EmbeddingOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the embedding operation. - From PyTorch `aten.embedding.default` operator to MLIR linalg `generic` - operation. + From buddy EmbeddingOp to MLIR linalg `generic` operation. Note: In this op, input node1's value is as index to get input node2's row slice. @@ -180,52 +203,51 @@ def embedding_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) input2 = symbol_table.get((str(node.args[1]), 0)) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation([0, 1, 2]) - op = linalg.GenericOp( - [tensor_type], - [input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(generic_map.get_submap([0, 1])), - ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] * 3 - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + generic_map = ir.AffineMap.get_permutation([0, 1, 2]) + op = linalg.GenericOp( + [tensor_type], + [input2], + 
[output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0]) - index2 = linalg.IndexOp(ir._i64Attr(2, None)) - value = tensor.ExtractOp(input1, [index1.result, index2.result]) - block.append(index1) - block.append(index2) - block.append(value) - block.append(linalg.YieldOp([value.result])) + ir.AffineMapAttr.get(generic_map.get_submap([0, 1])), + ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] * 3 + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0]) + index2 = linalg.IndexOp(ir._i64Attr(2, None)) + value = tensor.ExtractOp(input1, [index1.result, index2.result]) + block.append(index1) + block.append(index2) + block.append(value) + block.append(linalg.YieldOp([value.result])) return op def ones_op( - node: torch.fx.Node, + node: OnesOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor ones operation. - From PyTorch `aten.ones.default` operator to MLIR arith `constant` - operation. + From buddy OnesOp to MLIR arith `constant` operation. Note: This op, input node1's value is as index to get input node2's row slice. @@ -238,30 +260,21 @@ def ones_op( op: The operation return the arith.constant op. 
""" output_shape = list(node.args[0]) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - element = ir.BoolAttr.get(1) - tensor_type = ir.RankedTensorType.get(output_shape, element.type) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - elif dtype == "torch.int64": - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.ones(output_shape), signless=True, type=tensor_type - ) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, 1) + tensor_type = ir.RankedTensorType.get(output_shape, element.type) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) op = arith.ConstantOp(tensor_type, attr) return op - def full_op( - node: torch.fx.Node, + node: FullOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor full operation. - From PyTorch `aten.full.default` operator to MLIR arith `constant` - operation. + From buddy FullOp to MLIR arith `constant` operation. Note: This op, input node1's value is the shape of output tensor, input node2's value is the value of all elements in output tensor. 
@@ -275,39 +288,22 @@ def full_op( """ output_shape = list(node.args[0]) value = node.args[1] - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - element = ir.BoolAttr.get(bool(value)) - tensor_type = ir.RankedTensorType.get(output_shape, element.type) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - elif dtype == "torch.int64": - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.full(output_shape, value, dtype=numpy.int64), - signless=True, - type=tensor_type, - ) - elif dtype == "torch.float32": - dtype = ir.F32Type.get() - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.full(output_shape, value, dtype=numpy.float32), - signless=True, - type=tensor_type, - ) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, value) + tensor_type = ir.RankedTensorType.get(output_shape, element.type) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) op = arith.ConstantOp(tensor_type, attr) return op def lt_op( - node: torch.fx.Node, + node: LessThanOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor less than operation. - From PyTorch `aten.lt.Tensor` operator to MLIR arith `constant` operation. + From buddy LessThanOp to MLIR arith `constant` operation. Note: This op, campare two input nodes, and output bool tensor to represent compare result. 
@@ -321,93 +317,86 @@ def lt_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) input2 = symbol_table.get((str(node.args[1]), 0)) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] value = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), 2) shp1 = list(ir.RankedTensorType(ir.Value(input1).type).shape) shp2 = list(ir.RankedTensorType(ir.Value(input2).type).shape) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) - output = tensor.EmptyOp(output_shape, ir.IntegerType.get_signless(1)) - if len(shp1) < len(shp2): - if int(shp1[-1]) > 1 and shp2[-1] == 1: - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(shp2) + 1)] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [ - i - for i in range( - len(shp2) - len(shp1), len(shp2) - ) - ] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(0, len(shp2) - 1)] - + [len(shp2)] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(0, len(shp2))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(shp2) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + if len(shp1) < len(shp2): + if int(shp1[-1]) > 1 and shp2[-1] == 1: + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(shp2) + 1)] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input2], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [ + i + for i in range( + len(shp2) - len(shp1), len(shp2) + ) + ] ) - ] - ), + ), + ir.AffineMapAttr.get( + 
generic_map.get_submap( + [i for i in range(0, len(shp2) - 1)] + + [len(shp2)] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(0, len(shp2))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(shp2) + + [ir.Attribute.parse("#linalg.iterator_type")] + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(input2.type).element_type, + dtype, + ], + ) + if ( + str(ir.RankedTensorType(input2.type).element_type).find("i") + != -1 + ): + cmpop = arith.CmpIOp( + value, block.arguments[0], block.arguments[1] ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.IntegerType.get_signless(1), - ], + else: + cmpop = arith.CmpFOp( + value, block.arguments[0], block.arguments[1] ) - if ( - str(ir.RankedTensorType(input2.type).element_type).find("i") - != -1 - ): - cmpop = arith.CmpIOp( - value, block.arguments[0], block.arguments[1] - ) - else: - cmpop = arith.CmpFOp( - value, block.arguments[0], block.arguments[1] - ) - block.append(cmpop) - block.append(linalg.YieldOp([cmpop.result])) + block.append(cmpop) + block.append(linalg.YieldOp([cmpop.result])) return op def masked_fill_op( - node: torch.fx.Node, + node: MaskedFillOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor masked fill operation. - From PyTorch `aten.masked_fill.Scalar` operator to MLIR linalg `generic` - operation. + From buddy MaskedFillOp to MLIR linalg `generic` operation. Note: This op, input node2 is a bool tensor. Select input node1's value or input node3's value by true or false in input node2's value. 
@@ -423,71 +412,67 @@ def masked_fill_op( input2 = symbol_table.get((str(node.args[1]), 0)) if input1 is None or input2 is None: return - if str(node.args[0].meta["tensor_meta"].dtype) == "torch.float32": - value = float(node.args[2]) - attr = ir.FloatAttr.get(ir.F32Type.get(), value) - value = arith.ConstantOp(ir.F32Type.get(), attr) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + dtype = node.tensor_meta["dtype"] + value = node.args[2] + attr = mlir_element_attr_get(dtype, value) + dtype = mlir_element_type_get(dtype) + value = arith.ConstantOp(dtype, attr) + output_shape = list(node.tensor_meta["shape"]) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input2], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - select_op = arith.SelectOp( - block.arguments[1], value, 
block.arguments[0] - ) - block.append(select_op) - block.append(linalg.YieldOp([select_op.result])) + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + select_op = arith.SelectOp(block.arguments[1], value, block.arguments[0]) + block.append(select_op) + block.append(linalg.YieldOp([select_op.result])) return op def slice_op( - node: torch.fx.Node, + node: SliceOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor slice operation. - From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice` - operation. + From buddy SliceOp to MLIR tensor `extract_slice` operation. Note: This op, get the slice of input node1. 
Args: @@ -514,18 +499,14 @@ def slice_op( offset = [0 for x in input_shape] offset[dim] = start offset_attr = ir._denseI64ArrayAttr(offset, None) - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) size_attr = ir._denseI64ArrayAttr(output_shape, None) stride = [1 for x in output_shape] stride[dim] = step stride_attr = ir._denseI64ArrayAttr(stride, None) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) op = tensor.ExtractSliceOp( tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr @@ -535,13 +516,12 @@ def slice_op( def expand_op( - node: torch.fx.Node, + node: ExpandOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor expand operation. - From PyTorch `aten.expand.default` operator to MLIR tensor `extract_slice` - operation. + From buddy ExpandOp to MLIR tensor `extract_slice` operation. Note: This op, based on expand shape, create a new tensor and extract slice from origin tensor. 
@@ -559,26 +539,15 @@ def expand_op( if input1 is None: return input_shape = ir.RankedTensorType(input1.type).shape - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - empty_tensor = tensor.EmptyOp( - output_shape, ir.IntegerType.get_signless(1) - ) - elif dtype == "torch.float32": - empty_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + empty_tensor = tensor.EmptyOp(output_shape, dtype) if list(input_shape) == list(node.args[1]): offset_attr = ir._denseI64ArrayAttr([0 for x in input_shape], None) size_attr = ir._denseI64ArrayAttr(output_shape, None) stride_attr = ir._denseI64ArrayAttr([1 for x in input_shape], None) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) - elif dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) extract_tensor = tensor.ExtractSliceOp( tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr ) @@ -602,16 +571,10 @@ def expand_op( [1] * (i + 1) + [x for x in output_shape[i + 1 :]], None ) stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - [1] * (i + 1) + [x for x in output_shape[i + 1 :]], - ir.IntegerType.get_signless(1), - ) - elif dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - [1] * (i + 1) + [x for x in output_shape[i + 1 :]], - ir.F32Type.get(), - ) + tensor_type = ir.RankedTensorType.get( + [1] * (i + 1) + [x for x in output_shape[i + 1 :]], + dtype, + ) extract_tensor = tensor.ExtractSliceOp( tensor_type, input1, @@ -639,12 +602,12 @@ def expand_op( def to_copy_op( - node: torch.fx.Node, + node: ToCopyOp, symbol_table: 
Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor copy operation. - From PyTorch `aten._to_copy.default` operator to MLIR linalg `generic` + From buddy ToCopyOp to MLIR linalg `generic` operation. Note: This op, will convert input node's value type, such as float32 to @@ -660,10 +623,10 @@ def to_copy_op( input1 = symbol_table.get((str(node.args[0]), 0)) if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] - if dtype == "torch.bool": + if dtype == TensorDType.Bool: if str(ir.RankedTensorType(input1.type).element_type) == "f32": tensor_type = ir.RankedTensorType.get( output_shape, ir.IntegerType.get_signless(1) @@ -713,7 +676,7 @@ def to_copy_op( block.append(fptosi_op) block.append(trunc_op) block.append(linalg.YieldOp([trunc_op.result])) - elif dtype == "torch.float32": + elif dtype == TensorDType.Float32: if str(ir.RankedTensorType(input1.type).element_type) == "i1": tensor_type = ir.RankedTensorType.get( output_shape, ir.F32Type.get() @@ -764,12 +727,12 @@ def to_copy_op( def rsub_op( - node: torch.fx.Node, + node: RsubOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor rsub operation. - From PyTorch `aten.rsub.Scalar` operator to MLIR linalg `generic` operation. + From buddy RsubOp to MLIR linalg `generic` operation. 
Note: This op, compute input node1 rsub input node2 Args: @@ -782,20 +745,94 @@ def rsub_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) value = node.args[1] - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if not isinstance(value, torch.fx.Node): - if dtype == "torch.float32": - value = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), value) - ) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + if not isinstance(value, str): + value = arith.ConstantOp( + mlir_dtype, mlir_element_attr_get(dtype, value) + ) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + if str(ir.RankedTensorType(input1.type).element_type).find("i") != -1: + sub_op = arith.SubIOp(value.result, block.arguments[0]) + else: + sub_op = arith.SubFOp(value.result, block.arguments[0]) + block.append(sub_op) + block.append(linalg.YieldOp([sub_op.result])) + + return op + + +def pow_op( + node: PowOp, + symbol_table: Dict[Tuple[str, int], ir.Operation], +): + """ + Import the tensor copy operation. + From buddy PowOp to MLIR linalg `generic` + operation. + + Note: This op, compute input node's power result. 
+ Args: + node: Containing information from the input graph node. + symbol_table: A dictionary mapping symbols to their corresponding + operations. + + Returns: + op: The operation return the linalg.generic op. + """ + input1 = symbol_table.get((str(node.args[0]), 0)) + if input1 is None: + return + value = node.args[1] + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + if not isinstance(value, str): + if abs(int(value) - float(value)) < 1e-6: generic_map = ir.AffineMap.get_permutation( [i for i in range(len(output_shape))] ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + value = arith.ConstantOp( + ir.IntegerType.get_signless(32), + ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value), ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) op = linalg.GenericOp( [tensor_type], [input1], @@ -826,23 +863,28 @@ def rsub_op( ir.RankedTensorType(output.result.type).element_type, ], ) - subf_op = arith.SubFOp(value.result, block.arguments[0]) - block.append(subf_op) - block.append(linalg.YieldOp([subf_op.result])) + if ( + str(ir.RankedTensorType(input1.type).element_type).find("i") + != -1 + ): + powi_op = math.IPowIOp(block.arguments[0], value.result) + else: + powi_op = math.FPowIOp(block.arguments[0], value.result) + block.append(powi_op) + block.append(linalg.YieldOp([powi_op.result])) return op -def pow_op( - node: torch.fx.Node, +def mean_op( + node: MeanOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor copy operation. - From PyTorch `aten.pow.Tensor_Scalar` operator to MLIR linalg `generic` - operation. + From buddy MeanOp to MLIR linalg `generic` operation. - Note: This op, compute input node's power result. + Note: This op, compute input node's mean result in a specified dim. 
Args: node: Containing information from the input graph node. symbol_table: A dictionary mapping symbols to their corresponding @@ -854,160 +896,91 @@ def pow_op( input1 = symbol_table.get((str(node.args[0]), 0)) if input1 is None: return - value = node.args[1] - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if not isinstance(value, torch.fx.Node): - if dtype == "torch.float32": + dims = list(node.args[1]) + keep_dim = bool(node.args[2]) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, 0.0) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + output = arith.ConstantOp(tensor_type, attr) + assert len(dims) == 1 + for dim in dims: + if dim < 0: + dim = len(list(ir.RankedTensorType(input1.type).shape)) + dim + if keep_dim: generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] + [i for i in range(len(output_shape) + 1)] ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output_map = [i for i in range(len(output_shape))] + output_map[dim] = len(output_shape) + loop_type = [ + ir.Attribute.parse("#linalg.iterator_type") + ] * (len(output_shape) + 1) + loop_type[dim] = ir.Attribute.parse( + "#linalg.iterator_type" ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - if abs(int(value) - float(value)) < 1e-6: - value = arith.ConstantOp( - ir.IntegerType.get_signless(32), - ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value), - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in 
range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - fpowi_op = math.FPowIOp(block.arguments[0], value.result) - block.append(fpowi_op) - block.append(linalg.YieldOp([fpowi_op.result])) - - return op - - -def mean_op( - node: torch.fx.Node, - symbol_table: Dict[Tuple[str, int], ir.Operation], -): - """ - Import the tensor copy operation. - From PyTorch `aten.mean.dim` operator to MLIR linalg `generic` operation. - - Note: This op, compute input node's mean result in a specified dim. - Args: - node: Containing information from the input graph node. - symbol_table: A dictionary mapping symbols to their corresponding - operations. - - Returns: - op: The operation return the linalg.generic op. - """ - input1 = symbol_table.get((str(node.args[0]), 0)) - if input1 is None: - return - dims = list(node.args[1]) - keep_dim = bool(node.args[2]) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - element = ir.FloatAttr.get(ir.F32Type.get(), 0.0) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - output = arith.ConstantOp(tensor_type, attr) - - assert len(dims) == 1 - - for dim in dims: - if dim == -1: - dim = len(list(ir.RankedTensorType(input1.type).shape)) - 1 - if keep_dim: - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + 1)] - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output_map = [i for i in range(len(output_shape))] - output_map[dim] = len(output_shape) - loop_type = [ - ir.Attribute.parse("#linalg.iterator_type") - ] * (len(output_shape) + 1) - loop_type[dim] = 
ir.Attribute.parse( - "#linalg.iterator_type" - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap(output_map) - ), - ] - ), - ir.ArrayAttr.get(loop_type), - ) - block = ir.Block.create_at_start( - op.region, + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap(output_map) + ), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + value = arith.ConstantOp( + mlir_dtype, + mlir_element_attr_get( + dtype, list(ir.RankedTensorType(input1.type).shape)[dim] + ), + ) + if ( + str(ir.RankedTensorType(input1.type).element_type).find("i") + != -1 + ): + block_div_op = arith.DivSIOp(block.arguments[0], value.result) + block_add_op = arith.AddIOp( + block_div_op.result, block.arguments[1] ) - value = arith.ConstantOp( - ir.F32Type.get(), - ir.FloatAttr.get( - ir.F32Type.get(), - list(ir.RankedTensorType(input1.type).shape)[dim], - ), + else: + block_div_op = arith.DivFOp(block.arguments[0], value.result) + block_add_op = arith.AddFOp( + block_div_op.result, block.arguments[1] ) - divf_op = arith.DivFOp(block.arguments[0], value.result) - addf_op = arith.AddFOp(divf_op.result, block.arguments[1]) - block.append(value) - block.append(divf_op) - block.append(addf_op) - block.append(linalg.YieldOp([addf_op.result])) + block.append(value) + block.append(block_div_op) + block.append(block_add_op) + block.append(linalg.YieldOp([block_add_op.result])) 
return op def rsqrt_op( - node: torch.fx.Node, + node: RsqrtOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor rsqrt operation. - From PyTorch `aten.rsqrt.default` operator to MLIR linalg `generic` - operation. + From buddy RsqrtOp to MLIR linalg `generic` operation. Note: This op, compute input node's rsqrt result. Args: @@ -1023,59 +996,58 @@ def rsqrt_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - math_rsqrt_op = math.RsqrtOp(block.arguments[0]) - block.append(math_rsqrt_op) - block.append(linalg.YieldOp([math_rsqrt_op.result])) + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + 
ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + math_rsqrt_op = math.RsqrtOp(block.arguments[0]) + block.append(math_rsqrt_op) + block.append(linalg.YieldOp([math_rsqrt_op.result])) return op def mul_op( - node: torch.fx.Node, + node: MulOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor mul operation. - From PyTorch `aten.mul.Tensor` operator to MLIR linalg `generic` operation. + From buddy MulOp to MLIR linalg `generic` operation. Note: This op, compute input node's mul result. Args: @@ -1087,257 +1059,38 @@ def mul_op( op: The operation return the linalg.generic op. """ assert len(node.args) == 2 - if isinstance(node.args[0], torch.fx.Node): - input1 = symbol_table.get((str(node.args[0]), 0)) - else: - input1 = node.args[0] - - if isinstance(node.args[1], torch.fx.Node): + input1 = symbol_table.get((str(node.args[0]), 0)) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + shape = list(node.tensor_meta["shape"]) + if isinstance(node.args[1], str): input2 = symbol_table.get((str(node.args[1]), 0)) else: - input2 = node.args[1] - + data = [node.args[1]] + input2_shape = numpy.array(data).shape + tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, node.args[1]) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + input2 = arith.ConstantOp(tensor_type, attr).result if input1 is None or input2 is None: return - - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if isinstance(node.args[0], torch.fx.Node): - if dtype == "torch.float32": - if not isinstance(node.args[1], 
torch.fx.Node): - input2 = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2) - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - mulf_op = arith.MulFOp(block.arguments[0], input2.result) - block.append(mulf_op) - block.append(linalg.YieldOp([mulf_op.result])) - else: - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - input1_shape = list(ir.RankedTensorType(input1.type).shape) - if input1_shape != output_shape: - dims = [] - for i in range(len(input1_shape) - 1, -1, -1): - if ( - input1_shape[i] - != output_shape[ - len(output_shape) - (len(input1_shape) - i) - ] - ): - dims.append(i) - output1 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input1_map = [ - i - for i in range( - len(output_shape) - len(input1_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input1_map[i] = len(output_shape) + index - input1_map = generic_map.get_submap(input1_map) - input1_op = linalg.GenericOp( - [tensor_type], - [input1], - [output1], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - 
ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input1_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input1 = input1_op.result - - input2_shape = list(ir.RankedTensorType(input2.type).shape) - if input2_shape != output_shape: - dims = [] - for i in range(len(input2_shape) - 1, -1, -1): - if ( - input2_shape[i] - != output_shape[ - len(output_shape) - (len(input2_shape) - i) - ] - ): - dims.append(i) - output2 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input2_map = [ - i - for i in range( - len(output_shape) - len(input2_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input2_map[i] = len(output_shape) + index - input2_map = generic_map.get_submap(input2_map) - input2_op = linalg.GenericOp( - [tensor_type], - [input2], - [output2], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input2_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input2_op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input2 = input2_op.result - generic_map = ir.AffineMap.get_permutation( - [i for i in 
range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - mulf_op = arith.MulFOp(block.arguments[0], block.arguments[1]) - block.append(mulf_op) - block.append(linalg.YieldOp([mulf_op.result])) - - return op + mul_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype) + op = tosa.MulOp( + mul_result_tensor_type, + input1, + input2, + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + return op.result def t_op( - node: torch.fx.Node, + node: TOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor tanspose operation. - From PyTorch `aten.t.default` operator to MLIR linalg `generic` operation. + From buddy TransposeOp to MLIR linalg `generic` operation. Note: This op, compute input node's transpose result. 
Args: @@ -1353,50 +1106,23 @@ def t_op( if input1 is None: return - input_shape = list(ir.RankedTensorType(input1.type).shape) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if len(input_shape) == 2: - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation([0, 1]) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(generic_map.get_submap([0, 1])), - ir.AffineMapAttr.get(generic_map.get_submap([1, 0])), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + perm = ir._denseI64ArrayAttr([1, 0], None) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.transpose(input=input1, outs=[output], permutation=perm) - return op + return op.result[0] def matmul_op( - node: torch.fx.Node, + node: MatmulOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor matmul operation. - From PyTorch `aten.mm.default` operator to MLIR linalg `matmul` operation. + From Buddy MatmulOp to MLIR linalg `matmul` operation. Note: This op, compute input node's matrix multiplication result. 
Args: @@ -1413,25 +1139,24 @@ def matmul_op( if input1 is None or input2 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - f32 = ir.F32Type.get() - element = ir.FloatAttr.get(f32, 0.0) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result - op = linalg.matmul(input1, input2, outs=[matmul_result_buffer]) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, 0.0) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result + op = linalg.matmul(input1, input2, outs=[matmul_result_buffer]) return op def transpose_op( - node: torch.fx.Node, + node: TransposeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor transpose operation. - From PyTorch `aten.transpose.int` operator to MLIR linalg `generic` + From buddy TransposeSpecificDimOp to MLIR linalg `generic` operation. Note: This op, compute input node's transpose result. 
@@ -1449,51 +1174,25 @@ def transpose_op( return dim1 = int(node.args[1]) dim2 = int(node.args[2]) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - input1_map = [i for i in range(len(output_shape))] - input1_map[dim1], input1_map[dim2] = input1_map[dim2], input1_map[dim1] - output_map = [i for i in range(len(output_shape))] - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(generic_map.get_submap(input1_map)), - ir.AffineMapAttr.get(generic_map.get_submap(output_map)), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + output_perm = [i for i in range(len(output_shape))] + output_perm[dim2], output_perm[dim1] = output_perm[dim1], output_perm[dim2] + perm = ir._denseI64ArrayAttr(output_perm, None) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.transpose(input=input1, outs=[output], permutation=perm) - return op + return op.result[0] def index_op( - node: torch.fx.Node, + node: IndexOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor index operation. - From PyTorch `aten.index.Tensor` operator to MLIR linalg `generic` + From buddy IndexOp to MLIR linalg `generic` operation. Note: This op, get input node slice result by input index. 
@@ -1511,70 +1210,66 @@ def index_op( return input1_shape = ir.RankedTensorType(input1.type).shape input2 = node.args[1] - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) if len(input2) < len(input1_shape): - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - loops = ir.RankedTensorType( - symbol_table.get((str(input2[0]), 0)).type - ).shape - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + loops = ir.RankedTensorType( + symbol_table.get((str(input2[0]), 0)).type + ).shape + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + input_map = [ + ir.AffineMapAttr.get( + generic_map.get_submap([j for j in range(len(loops))]) ) - input_map = [ - ir.AffineMapAttr.get( - generic_map.get_submap([j for j in range(len(loops))]) - ) - for i in range(len(input2)) - ] + [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [j for j in range(len(output_shape))] - ) - ) - ] - operands = [symbol_table.get((str(i), 0)) for i in input2] - op = linalg.GenericOp( - [tensor_type], - operands, - [output], - ir.ArrayAttr.get(input_map), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), + for i in range(len(input2)) + ] + [ + ir.AffineMapAttr.get( + generic_map.get_submap([j for j in range(len(output_shape))]) ) - arguments = [ - ir.RankedTensorType(i.type).element_type for i in operands - ] + [ir.RankedTensorType(output.result.type).element_type] - block = ir.Block.create_at_start(op.region, arguments) - index = [] - for i in block.arguments[:-1]: - indexcast_op = 
arith.IndexCastOp(ir.IndexType.get(), i) - block.append(indexcast_op) - index.append(indexcast_op.result) - for i in range(len(loops), len(output_shape) - len(input2) + 1): - index_op = linalg.IndexOp(ir._i64Attr(i, None)) - block.append(index_op) - index.append(index_op.result) - value = tensor.ExtractOp(input1, index) - block.append(value) - block.append(linalg.YieldOp([value.result])) + ] + operands = [symbol_table.get((str(i), 0)) for i in input2] + op = linalg.GenericOp( + [tensor_type], + operands, + [output], + ir.ArrayAttr.get(input_map), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + arguments = [ + ir.RankedTensorType(i.type).element_type for i in operands + ] + [ir.RankedTensorType(output.result.type).element_type] + block = ir.Block.create_at_start(op.region, arguments) + index = [] + for i in block.arguments[:-1]: + indexcast_op = arith.IndexCastOp(ir.IndexType.get(), i) + block.append(indexcast_op) + index.append(indexcast_op.result) + for i in range(len(loops), len(output_shape) - len(input2) + 1): + index_op = linalg.IndexOp(ir._i64Attr(i, None)) + block.append(index_op) + index.append(index_op.result) + value = tensor.ExtractOp(input1, index) + block.append(value) + block.append(linalg.YieldOp([value.result])) return op def neg_op( - node: torch.fx.Node, + node: NegOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor neg operation. - From PyTorch `aten.neg.default` operator to MLIR linalg `matmul` operation. + From buddy NegOp to MLIR linalg `negf` operation. Note: This op, compute input node's neg result. 
Args: @@ -1589,59 +1284,22 @@ def neg_op( input1 = symbol_table.get((str(node.args[0]), 0)) if input1 is None: return - - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - negf_op = arith.NegFOp(block.arguments[0]) - block.append(negf_op) - block.append(linalg.YieldOp([negf_op.result])) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.negf(input1, outs=output) return op def cat_op( - node: torch.fx.Node, + node: CatOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor concate operation. - From PyTorch `aten.cat.default` operator to MLIR tensor `insert_slice` + From buddy CatOp to MLIR tensor `insert_slice` operation. Note: This op, concate two input tensor. 
@@ -1660,52 +1318,52 @@ def cat_op( if input1 is None or input2 is None: return - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) if dim < 0: dim = len(output_shape) + dim - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - input1_shape = ir.RankedTensorType(input1.type).shape - size_attr = ir._denseI64ArrayAttr(input1_shape, None) - stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - insert_input1 = tensor.InsertSliceOp( - input1, - output.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) - offset[dim] += input1_shape[dim] - offset_attr = ir._denseI64ArrayAttr(offset, None) - input2_shape = ir.RankedTensorType(input2.type).shape - size_attr = ir._denseI64ArrayAttr(input2_shape, None) - insert_input2 = tensor.InsertSliceOp( - input2, - insert_input1.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + input1_shape = ir.RankedTensorType(input1.type).shape + size_attr = ir._denseI64ArrayAttr(input1_shape, None) + stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) + insert_input1 = tensor.InsertSliceOp( + input1, + output.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) + offset[dim] += input1_shape[dim] + offset_attr = ir._denseI64ArrayAttr(offset, None) + input2_shape = ir.RankedTensorType(input2.type).shape + size_attr = ir._denseI64ArrayAttr(input2_shape, None) + insert_input2 = tensor.InsertSliceOp( + input2, + insert_input1.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) return insert_input2 def squeeze_op( - 
node: torch.fx.Node, + node: SqueezeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor squeeze operation. - From PyTorch `aten.squeeze.dim` operator to MLIR linalg `generic` operation. + From buddy SqueezeOp to MLIR linalg `generic` operation. Note: This op, reduce the input tensor's shape dims by specified dim. Args: @@ -1722,78 +1380,78 @@ def squeeze_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) input1_shape = ir.RankedTensorType(input1.type).shape if dim < 0: dim = len(input1_shape) + dim - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - if input1_shape[dim] != 1: - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - size_attr = ir._denseI64ArrayAttr(input1_shape, None) - stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - op = tensor.InsertSliceOp( - input1, - output.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) - else: - output_map = ir.AffineMap.get( - len(output_shape), - 0, - [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))], - ) - input1_map = [] - loop_index = 0 - for i in range(len(input1_shape)): - if len(input1_map) == dim: - input1_map.append(ir.AffineExpr.get_constant(0)) - else: - input1_map.append(ir.AffineExpr.get_dim(loop_index)) - loop_index += 1 - input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(output_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + dtype = node.tensor_meta["dtype"] + mlir_dtype = 
mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + if input1_shape[dim] != 1: + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + size_attr = ir._denseI64ArrayAttr(input1_shape, None) + stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) + op = tensor.InsertSliceOp( + input1, + output.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) + else: + output_map = ir.AffineMap.get( + len(output_shape), + 0, + [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))], + ) + input1_map = [] + loop_index = 0 + for i in range(len(input1_shape)): + if len(input1_map) == dim: + input1_map.append(ir.AffineExpr.get_constant(0)) + else: + input1_map.append(ir.AffineExpr.get_dim(loop_index)) + loop_index += 1 + input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(output_map), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + block.append(linalg.YieldOp([block.arguments[0]])) return op def batch_matmul_op( - node: torch.fx.Node, + node: BatchMatmulOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor batch matmul operation. - From PyTorch `aten.bmm.default` operator to MLIR linalg `batch_matmul` + From buddy BatchMatmulOp to MLIR linalg `batch_matmul` operation. Note: This op, compute input node's batch matrix multiplication result. 
@@ -1811,45 +1469,25 @@ def batch_matmul_op( if input1 is None or input2 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - # use linalg.generic implementation - generic_map = ir.AffineMap.get_permutation([0, 1, 2]) - zero_fill = linalg.GenericOp( - [tensor_type], - [], - [output], - ir.ArrayAttr.get( - [ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2]))] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] * 3 - ), - ) - block = ir.Block.create_at_start( - zero_fill.region, - [ir.RankedTensorType(output.result.type).element_type], - ) - zero_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0) - ) - block.append(zero_op) - block.append(linalg.YieldOp([zero_op.result])) - op = linalg.batch_matmul(input1, input2, outs=[zero_fill.result]) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, 0) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + zero_fill = arith.ConstantOp(tensor_type, attr).result + op = linalg.batch_matmul(input1, input2, outs=[zero_fill]) return op def div_op( - node: torch.fx.Node, + node: DivOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor divsion operation. - From PyTorch `aten.div.Tensor` operator to MLIR linalg `generic` operation. + From buddy DivOp to MLIR linalg `generic` operation. Note: This op, compute input node's division result. Args: @@ -1861,258 +1499,38 @@ def div_op( op: The operation return the linalg.generic op. 
""" assert len(node.args) == 2 - if isinstance(node.args[0], torch.fx.Node): - input1 = symbol_table.get((str(node.args[0]), 0)) - else: - input1 = node.args[0] - - if isinstance(node.args[1], torch.fx.Node): + input1 = symbol_table.get((str(node.args[0]), 0)) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + shape = list(node.tensor_meta["shape"]) + if isinstance(node.args[1], str): input2 = symbol_table.get((str(node.args[1]), 0)) else: - input2 = node.args[1] - + data = [node.args[1]] + input2_shape = numpy.array(data).shape + tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, node.args[1]) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + input2 = arith.ConstantOp(tensor_type, attr).result if input1 is None or input2 is None: return - - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if isinstance(node.args[0], torch.fx.Node): - if dtype == "torch.float32": - if not isinstance(node.args[1], torch.fx.Node): - input2 = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2) - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - divf_op = 
arith.DivFOp(block.arguments[0], input2.result) - block.append(divf_op) - block.append(linalg.YieldOp([divf_op.result])) - else: - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - input1_shape = list(ir.RankedTensorType(input1.type).shape) - if input1_shape != output_shape: - dims = [] - for i in range(len(input1_shape) - 1, -1, -1): - if ( - input1_shape[i] - != output_shape[ - len(output_shape) - (len(input1_shape) - i) - ] - ): - dims.append(i) - output1 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input1_map = [ - i - for i in range( - len(output_shape) - len(input1_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input1_map[i] = len(output_shape) + index - input1_map = generic_map.get_submap(input1_map) - input1_op = linalg.GenericOp( - [tensor_type], - [input1], - [output1], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input1_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input1 = input1_op.result - - input2_shape = list(ir.RankedTensorType(input2.type).shape) - if input2_shape != output_shape: - dims = [] - for i in range(len(input2_shape) - 1, -1, -1): - if ( - input2_shape[i] - != output_shape[ - len(output_shape) - (len(input2_shape) - i) - ] - ): - dims.append(i) - output2 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = 
ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input2_map = [ - i - for i in range( - len(output_shape) - len(input2_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input2_map[i] = len(output_shape) + index - input2_map = generic_map.get_submap(input2_map) - input2_op = linalg.GenericOp( - [tensor_type], - [input2], - [output2], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input2_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input2_op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input2 = input2_op.result - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - divf_op = arith.DivFOp(block.arguments[0], block.arguments[1]) - block.append(divf_op) - block.append(linalg.YieldOp([divf_op.result])) - - return op + div_result_tensor_type 
= ir.RankedTensorType.get(shape, mlir_dtype) + op = tosa.MulOp( + div_result_tensor_type, + input1, + tosa.ReciprocalOp(input2.type, input2).result, + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + return op.result def softmax_op( - node: torch.fx.Node, + node: SoftmaxOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor softmax operation. - From PyTorch `aten._softmax.default` operator to MLIR linalg `generic` - operation. + From buddy SoftmaxOp to MLIR linalg `generic` operation. Note: This op, compute input node's softmax result. Args: @@ -2129,266 +1547,109 @@ def softmax_op( dim = int(node.args[1]) if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] if dim < 0: dim += len(output_shape) - if dtype == "torch.float32": - max_tensor_shape = copy.deepcopy(output_shape) - max_tensor_shape[dim] = 1 - max_tensor_type = ir.RankedTensorType.get( - max_tensor_shape, ir.F32Type.get() - ) - max_tensor = tensor.EmptyOp(max_tensor_shape, ir.F32Type.get()) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(max_tensor_shape)) - ] - max_tensor_map = ir.AffineMap.get( - len(max_tensor_shape), 0, max_tensor_map - ) - neg_inf_fill = linalg.GenericOp( - [max_tensor_type], - [], - [max_tensor], - ir.ArrayAttr.get([ir.AffineMapAttr.get(max_tensor_map)]), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(max_tensor_shape) - ), - ) - block = ir.Block.create_at_start( - neg_inf_fill.region, - [ir.RankedTensorType(max_tensor.result.type).element_type], - ) - neg_inf_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), float("-inf")) - ) - block.append(neg_inf_op) - block.append(linalg.YieldOp([neg_inf_op.result])) - - input1_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - input1_map = 
ir.AffineMap.get(len(output_shape), 0, input1_map) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - max_tensor_map[dim] = ir.AffineExpr.get_constant(0) - max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map) - loop_type = [ - ir.Attribute.parse("#linalg.iterator_type") - ] * len(output_shape) - loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type") - max_tensor_op = linalg.GenericOp( - [max_tensor_type], - [input1], - [neg_inf_fill], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(max_tensor_map), - ] - ), - ir.ArrayAttr.get(loop_type), - ) - block = ir.Block.create_at_start( - max_tensor_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(neg_inf_fill.result.type).element_type, - ], - ) - max_op = arith.MaximumFOp(block.arguments[0], block.arguments[1]) - block.append(max_op) - block.append(linalg.YieldOp([max_op.result])) - - exp_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) - exp_tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - input1_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - max_tensor_map[dim] = ir.AffineExpr.get_constant(0) - max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map) - exp_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map) - exp_tensor_op = linalg.GenericOp( - [exp_tensor_type], - [input1, max_tensor_op.result], - [exp_tensor], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(max_tensor_map), - ir.AffineMapAttr.get(exp_tensor_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block 
= ir.Block.create_at_start( - exp_tensor_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(max_tensor_op.result.type).element_type, - ir.RankedTensorType(exp_tensor.result.type).element_type, - ], - ) - sub_op = arith.SubFOp(block.arguments[0], block.arguments[1]) - exp_op = math.ExpOp(sub_op.result) - block.append(sub_op) - block.append(exp_op) - block.append(linalg.YieldOp([exp_op.result])) - - reduce_sum_tensor_shape = copy.deepcopy(output_shape) - reduce_sum_tensor_shape[dim] = 1 - reduce_sum_tensor = tensor.EmptyOp( - reduce_sum_tensor_shape, ir.F32Type.get() - ) - reduce_sum_tensor_type = ir.RankedTensorType.get( - reduce_sum_tensor_shape, ir.F32Type.get() - ) - reduce_sum_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - reduce_sum_tensor_map = ir.AffineMap.get( - len(output_shape), 0, reduce_sum_tensor_map - ) - zero_fill_op = linalg.GenericOp( - [reduce_sum_tensor_type], - [], - [reduce_sum_tensor.result], - ir.ArrayAttr.get([ir.AffineMapAttr.get(reduce_sum_tensor_map)]), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - zero_fill_op.region, - [ir.RankedTensorType(reduce_sum_tensor.result.type).element_type], - ) - zero_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0) - ) - block.append(zero_op) - block.append(linalg.YieldOp([zero_op.result])) - - reduce_sum_tensor_shape = copy.deepcopy(output_shape) - reduce_sum_tensor_shape[dim] = 1 - reduce_sum_tensor_type = ir.RankedTensorType.get( - reduce_sum_tensor_shape, ir.F32Type.get() - ) - exp_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map) - reduce_sum_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0) - reduce_sum_tensor_map = 
ir.AffineMap.get( - len(output_shape), 0, reduce_sum_tensor_map - ) - loop_type = [ - ir.Attribute.parse("#linalg.iterator_type") - ] * len(output_shape) - loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type") - reduce_sum_tensor_op = linalg.GenericOp( - [reduce_sum_tensor_type], - [exp_tensor_op.result], - [zero_fill_op.result], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(exp_tensor_map), - ir.AffineMapAttr.get(reduce_sum_tensor_map), - ] - ), - ir.ArrayAttr.get(loop_type), - ) - block = ir.Block.create_at_start( - reduce_sum_tensor_op.region, + mlir_dtype = mlir_element_type_get(dtype) + # tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + # output = tensor.EmptyOp(output_shape, mlir_dtype) + # op = linalg.softmax( + # [tensor_type], + # input1, + # output, + # ir.IntegerAttr.get(ir.IntegerType.get_signless(64), dim), + # ) + # print(op, flush=True) + sum_tensor_shape = copy.deepcopy(output_shape) + sum_tensor_shape[dim] = 1 + sum_tensor_type = ir.RankedTensorType.get(sum_tensor_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, 0) + attr = ir.DenseElementsAttr.get_splat(sum_tensor_type, element) + sum_tensor = arith.ConstantOp(sum_tensor_type, attr).result + input1_map = [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))] + input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) + sum_tensor_map = [ + ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) + ] + sum_tensor_map[dim] = ir.AffineExpr.get_constant(0) + sum_tensor_map = ir.AffineMap.get(len(output_shape), 0, sum_tensor_map) + loop_type = [ir.Attribute.parse("#linalg.iterator_type")] * len( + output_shape + ) + loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type") + sum_tensor_op = linalg.GenericOp( + [sum_tensor_type], + [input1], + [sum_tensor], + ir.ArrayAttr.get( [ - ir.RankedTensorType(exp_tensor_op.result.type).element_type, - ir.RankedTensorType(zero_fill_op.result.type).element_type, - ], - ) - add_op = 
arith.AddFOp(block.arguments[0], block.arguments[1]) - block.append(add_op) - block.append(linalg.YieldOp([add_op.result])) - - reduce_sum_tensor_shape = copy.deepcopy(output_shape) - reduce_sum_tensor_shape[dim] = 1 - result_tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - result_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) - exp_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map) - reduce_sum_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0) - reduce_sum_tensor_map = ir.AffineMap.get( - len(output_shape), 0, reduce_sum_tensor_map - ) - result_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - result_tensor_map = ir.AffineMap.get( - len(output_shape), 0, result_tensor_map - ) - op = linalg.GenericOp( - [result_tensor_type], - [exp_tensor_op.result, reduce_sum_tensor_op.result], - [result_tensor.result], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(exp_tensor_map), - ir.AffineMapAttr.get(reduce_sum_tensor_map), - ir.AffineMapAttr.get(result_tensor_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(sum_tensor_map), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + sum_tensor_op.region, + [ + mlir_dtype, + mlir_dtype, + ], + ) + exp_op = math.ExpOp(block.arguments[0]) + add_op = arith.AddFOp(exp_op.result, block.arguments[1]) + block.append(exp_op) + block.append(add_op) + block.append(linalg.YieldOp([add_op.result])) + result_tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + result_tensor = tensor.EmptyOp(output_shape, mlir_dtype) + result_tensor_map = [ + ir.AffineExpr.get_dim(i) for i in 
range(len(output_shape)) + ] + result_tensor_map = ir.AffineMap.get( + len(output_shape), 0, result_tensor_map + ) + op = linalg.GenericOp( + [result_tensor_type], + [input1, sum_tensor_op.result], + [result_tensor.result], + ir.ArrayAttr.get( [ - ir.RankedTensorType(exp_tensor_op.result.type).element_type, - ir.RankedTensorType( - reduce_sum_tensor_op.result.type - ).element_type, - ir.RankedTensorType(result_tensor.result.type).element_type, - ], - ) - div_op = arith.DivFOp(block.arguments[0], block.arguments[1]) - block.append(div_op) - block.append(linalg.YieldOp([div_op.result])) + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(sum_tensor_map), + ir.AffineMapAttr.get(result_tensor_map), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + mlir_dtype, + mlir_dtype, + mlir_dtype, + ], + ) + exp_op = math.ExpOp(block.arguments[0]) + div_op = arith.DivFOp(exp_op.result, block.arguments[1]) + block.append(exp_op) + block.append(div_op) + block.append(linalg.YieldOp([div_op.result])) return op def clone_op( - node: torch.fx.Node, + node: CloneOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor clone operation. - From PyTorch `aten.clone.default` operator to MLIR tensor `extract_slice` + From buddy CloneOp to MLIR tensor `extract_slice` operation. Note: This op, clone input tensor to a new tensor. 
@@ -2405,31 +1666,29 @@ def clone_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - size_attr = ir._denseI64ArrayAttr(output_shape, None) - stride = [1 for x in output_shape] - stride_attr = ir._denseI64ArrayAttr(stride, None) - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - - op = tensor.ExtractSliceOp( - tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr - ) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + size_attr = ir._denseI64ArrayAttr(output_shape, None) + stride = [1 for x in output_shape] + stride_attr = ir._denseI64ArrayAttr(stride, None) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + op = tensor.ExtractSliceOp( + tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr + ) return op def silu_op( - node: torch.fx.Node, + node: SiluOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor silu activation operation. - From PyTorch `aten.silu.default` operator to MLIR linalg `generic` - operation. + From Buddy SiluOp to MLIR linalg `generic` operation. Note: This op, compute input node's silu activation result. 
Args: @@ -2445,63 +1704,61 @@ def silu_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - neg_op = arith.NegFOp(block.arguments[0]) - exp_op = math.ExpOp(neg_op.result) - one_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 1) - ) - add_op = arith.AddFOp(one_op.result, exp_op.result) - div_op = arith.DivFOp(block.arguments[0], add_op.result) - block.append(neg_op) - block.append(exp_op) - block.append(one_op) - block.append(add_op) - block.append(div_op) - block.append(linalg.YieldOp([div_op.result])) + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + 
generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + neg_op = arith.NegFOp(block.arguments[0]) + exp_op = math.ExpOp(neg_op.result) + one_op = arith.ConstantOp(mlir_dtype, mlir_element_attr_get(dtype, 1)) + add_op = arith.AddFOp(one_op.result, exp_op.result) + div_op = arith.DivFOp(block.arguments[0], add_op.result) + block.append(neg_op) + block.append(exp_op) + block.append(one_op) + block.append(add_op) + block.append(div_op) + block.append(linalg.YieldOp([div_op.result])) return op def param_extract( - node: torch.fx.Node, + node: PlaceholderOp, offset, params_mlir_node, ): @@ -2519,12 +1776,12 @@ def param_extract( op: The operation return the tensor.expand_shape op. """ dtype_mapping = { - torch.float32: ir.F32Type.get(), - torch.int64: ir.IntegerType.get_signless(64), + TensorDType.Float32: ir.F32Type.get(), + TensorDType.Int64: ir.IntegerType.get_signless(64), } - tensor_element_type = dtype_mapping[node.meta["tensor_meta"].dtype] - output_shape = list(node.meta["tensor_meta"].shape) - extract_size = functools.reduce(lambda x, y: x * y, output_shape) + tensor_element_type = dtype_mapping[node.tensor_meta["dtype"]] + output_shape = list(node.tensor_meta["shape"]) + extract_size = functools.reduce(lambda x, y: x * y, output_shape, 1) offset_attr = ir._denseI64ArrayAttr([offset], None) size_attr = ir._denseI64ArrayAttr([extract_size], None) stride = [1] @@ -2540,7 +1797,7 @@ def param_extract( size_attr, stride_attr, ) - if len(output_shape) == 1: + if len(output_shape) == 1 or len(output_shape) == 0: return extract_slice_op tensor_type = ir.RankedTensorType.get(output_shape, tensor_element_type) axis = ir.ArrayAttr.get( @@ -2553,36 +1810,123 @@ def param_extract( axis 
= ir.ArrayAttr.get([axis], None) return tensor.ExpandShapeOp(tensor_type, extract_slice_op.result, axis) +def where_op( + node: WhereOp, + symbol_table: Dict[Tuple[str, int], ir.Operation], +): + """ + Import the tensor where operation. + From Buddy WhereOp to MLIR linalg `generic` operation. + + Note: This op selects values element-wise based on a condition. + Args: + node: Containing information from the input graph node. + symbol_table: A dictionary mapping symbols to their corresponding + operations. + + Returns: + op: The operation return the linalg.generic op. + """ + assert len(node.args) == 3 + input1 = symbol_table.get((str(node.args[0]), 0)) + input2 = symbol_table.get((str(node.args[1]), 0)) + input3 = symbol_table.get((str(node.args[2]), 0)) + if input1 is None or input2 is None or input3 is None: + return + + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input3], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(input3.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + select_op = arith.SelectOp(block.arguments[0], input2, block.arguments[1]) + block.append(select_op) + 
block.append(linalg.YieldOp([select_op.result])) + + return op + +def scalar_tensor_op(node: ScalarTensorOp, symbol_table): + """ + Import the tensor Scalar_Tensor operation. + From Buddy ScalarTensorOp to MLIR arith `ConstantOp` operation. + """ + assert len(node.args) == 1 + dtype = node.tensor_meta["dtype"] + attr = mlir_element_attr_get(dtype, node.args[0]) + op = arith.ConstantOp(dtype, attr) + + return op ops_registry = { - "arange.start": arange_op, - "arange.default": arange_op, - "unsqueeze.default": unsqueeze_op, - "view.default": view_op, - "ones.default": ones_op, - "full.default": full_op, - "lt.Tensor": lt_op, - "embedding.default": embedding_op, - "masked_fill.Scalar": masked_fill_op, - "slice.Tensor": slice_op, - "expand.default": expand_op, - "_to_copy.default": to_copy_op, - "rsub.Scalar": rsub_op, - "pow.Tensor_Scalar": pow_op, - "mean.dim": mean_op, - "rsqrt.default": rsqrt_op, - "mul.Tensor": mul_op, - "t.default": t_op, - "mm.default": matmul_op, - "transpose.int": transpose_op, - "index.Tensor": index_op, - "neg.default": neg_op, - "cat.default": cat_op, - "squeeze.dim": squeeze_op, - "bmm.default": batch_matmul_op, - "div.Tensor": div_op, - "_softmax.default": softmax_op, - "clone.default": clone_op, - "silu.default": silu_op, "param.extract": param_extract, + "MatmulOp": matmul_op, + "ArangeOp": arange_op, + "UnsqueezeOp": unsqueeze_op, + "ViewOp": view_op, + "EmbeddingOp": embedding_op, + "OnesOp": ones_op, + "FullOp": full_op, + "LessThanOp": lt_op, + "MaskedFillOp": masked_fill_op, + "SliceOp": slice_op, + "ExpandOp": expand_op, + "ToCopyOp": to_copy_op, + "RsubOp": rsub_op, + "PowOp": pow_op, + "MeanOp": mean_op, + "RsqrtOp": rsqrt_op, + "MulOp": mul_op, + "TOp": t_op, + "TransposeOp": transpose_op, + "IndexOp": index_op, + "NegOp": neg_op, + "CatOp": cat_op, + "SqueezeOp": squeeze_op, + "BatchMatmulOp": batch_matmul_op, + "DivOp": div_op, + "SoftmaxOp": softmax_op, + "CloneOp": clone_op, + "SiluOp": silu_op, + "AddOp": add_op, + 
"WhereOp": where_op, + "ScalarTensorOp": scalar_tensor_op, } diff --git a/frontend/Python/ops/math.py b/frontend/Python/ops/math.py index 7e2de80b5..19820c2b3 100644 --- a/frontend/Python/ops/math.py +++ b/frontend/Python/ops/math.py @@ -22,11 +22,16 @@ def erf_op(node, symbol_table): - input_ = symbol_table.get((str(node.args[0]), 0)) - op = math.ErfOp(input_) + input_tensor = symbol_table.get((str(node.args[0]), 0)) + op = math.ErfOp(input_tensor) return op +def sqrt_op(node, symbol_table): + input_tensor = symbol_table.get((str(node.args[0]), 0)) + return math.SqrtOp(input_tensor) + ops_registry = { - "erf.default": erf_op, + "ErfOp": erf_op, + "SqrtOp": sqrt_op, } diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py index bf957002a..8a0997a3a 100644 --- a/frontend/Python/ops/tosa.py +++ b/frontend/Python/ops/tosa.py @@ -14,17 +14,52 @@ # # ===--------------------------------------------------------------------------- # -# The registry of mappings from Torch node to MLIR tosa dialect operations. +# The registry of mappings from Buddy Graph to MLIR tosa dialect operations. 
# # ===--------------------------------------------------------------------------- -import torch import array from typing import Dict, List, Tuple, Union +import numpy import mlir.ir as ir from mlir.dialects import tensor, tosa +from ..graph import TensorDType +from ..graph import ( + AddOp, + PermuteOp, + AddMMOp, + BatchMatmulOp, + SubOp, + MulOp, + DivOp, + TanhOp, + ExpOp, + RsqrtOp, + AmaxOp, + ReshapeOp, + UnsqueezeOp, + SelectOp, + SliceOp, + ConvertElementTypeOp, + CloneOp, + VarMeanOp, + EmbeddingOp, + ExpandOp, + SumDimOp, + TOp, + TransposeOp, + MaxPool2dOp, + Conv2dOp, + ReluOp, + IotaOp, + SigmoidOp, + ReciprocalOp, + MeanOp, +) +from .utils import * + def _normalize_binary_operator_shape(shp1, shp2): """Normalize the shape of two input tensors according to the broadcasting @@ -75,9 +110,8 @@ def _gen_arith_binary_op(input1, input2, op_func): def _scalar_to_tensor( scalar: Union[float, int], element_type: ir.Type, shape: List[int] ): - """PyTorch allow the binary operation between tensor and scalar. But MLIR - does not. - So we need to convert scalars to the corresponding tensors.""" + """Convert scalars to corresponding tensors since MLIR + doesn't support operations between scalars and tensors.""" element = ( ir.FloatAttr.get(element_type, float(scalar)) if str(element_type) == "f32" @@ -128,11 +162,11 @@ def _normalize_binary_operator_args(arg1, arg2): def addmm_op( - node, symbol_table: Dict[Tuple[str, int], ir.Operation] + node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation] ) -> ir.Operation: """ Import matrix multiplication operation. - From PyTorch `aten.addmm.default` operator to MLIR TOSA `matmul` operation. + From buddy graph ir's `AddMMOp` operator to MLIR TOSA `matmul` operation. Note: this function first reshapes the input matrices to 3D tensors (since tosa.MatMulOp requires it). 
Then it multiplies these reshaped @@ -146,8 +180,7 @@ def addmm_op( Returns: op: The operation representing the result of adding the matrix - multiplication - to the input tensor. + multiplication to the input tensor. """ # get input input_ = symbol_table.get((str(node.args[0]), 0)) @@ -184,10 +217,11 @@ def addmm_op( return op -def bmm_op(node, symbol_table) -> ir.Operation: +def bmm_op(node: BatchMatmulOp, symbol_table) -> ir.Operation: """ Import batch matrix multiplication operation. - From PyTorch `aten.bmm.default` operator to MLIR TOSA `matmul` operation. + From buddy graph ir's `BatchMatmulOp` operator to MLIR TOSA `matmul` + operation. """ input_ = symbol_table.get((str(node.args[0]), 0)) mat2 = symbol_table.get((str(node.args[1]), 0)) @@ -200,30 +234,30 @@ def bmm_op(node, symbol_table) -> ir.Operation: return op -def add_op(node, symbol_table): +def add_op(node: AddOp, symbol_table): """ Import tensor addition operation. - From PyTorch `aten.add.Tensor` operator to MLIR TOSA `add` operation. + From buddy graph ir's `AddOp` operator to MLIR TOSA `add` operation. """ input1 = symbol_table.get((str(node.args[0]), 0), node.args[0]) input2 = symbol_table.get((str(node.args[1]), 0), node.args[1]) return _gen_arith_binary_op(input1, input2, tosa.AddOp) -def sub_op(node, symbol_table): +def sub_op(node: SubOp, symbol_table): """ Import tensor subtraction operation. - From PyTorch `aten.sub.Tensor` operator to MLIR TOSA `sub` operation. + From buddy graph ir's `SubOp` operator to MLIR TOSA `sub` operation. """ input1 = symbol_table.get((str(node.args[0]), 0), node.args[0]) input2 = symbol_table.get((str(node.args[1]), 0), node.args[1]) return _gen_arith_binary_op(input1, input2, tosa.SubOp) -def mul_op(node, symbol_table): +def mul_op(node: MulOp, symbol_table): """ - Import tensor multiplication operation. - From PyTorch `aten.mul.Tensor` operator to MLIR TOSA `mul` operation. + Import tensor multiplication operation. 
+ From buddy graph ir's `MulOp` operator to MLIR TOSA `mul` operation. """ def _inner_op(result_type, input1, input2): @@ -240,10 +274,10 @@ def _inner_op(result_type, input1, input2): return _gen_arith_binary_op(input1, input2, _inner_op) -def div_op(node, symbol_table): +def div_op(node: DivOp, symbol_table): """ Import tensor division operation. - From PyTorch `aten.div.Tensor` operator to MLIR TOSA `div` operation. + From buddy graph ir's `DivOp` operator to MLIR TOSA `div` operation. """ def _inner_op(result_type, input1, input2): @@ -260,10 +294,10 @@ def _inner_op(result_type, input1, input2): return _gen_arith_binary_op(input1, input2, _inner_op) -def tanh_op(node, symbol_table): +def tanh_op(node: TanhOp, symbol_table): """ Import elementwise tanh operation. - From PyTorch `aten.tanh.default` operator to MLIR TOSA `tanh` operation. + From buddy graph ir's `TanhOp` operator to MLIR TOSA `tanh` operation. """ input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -273,10 +307,10 @@ def tanh_op(node, symbol_table): return op -def exp_op(node, symbol_table): +def exp_op(node: ExpOp, symbol_table): """ Import elementwise exponential operation. - From PyTorch `aten.exp.default` operator to MLIR TOSA `exp` operation. + From buddy graph ir's `ExpOp` operator to MLIR TOSA `exp` operation. """ input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -286,10 +320,10 @@ def exp_op(node, symbol_table): return op -def rsqrt_op(node, symbol_table): +def rsqrt_op(node: RsqrtOp, symbol_table): """ Import elementwise reciprocal square root operation. - From PyTorch `aten.rsqrt.default` operator to MLIR TOSA `rsqrt` operation. + From buddy graph ir's `RsqrtOp` operator to MLIR TOSA `rsqrt` operation. 
""" input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -301,15 +335,11 @@ def rsqrt_op(node, symbol_table): return op -def amax_op(node, symbol_table): +def amax_op(node: AmaxOp, symbol_table): """ Import the amax operation. - From PyTorch `aten.amax.default` operator to MLIR TOSA `reduce_max` + From buddy graph ir's `AmaxOp` operator to MLIR TOSA `reduce_max` operation. - - Note: This conversion function returns the maximum value of each slice - of the input tensor in the given dimension(s). This is consistent - with PyTorch's `torch.amax` operator. """ input1 = symbol_table.get((str(node.args[0]), 0)) dim_val = node.args[1][0] @@ -321,10 +351,10 @@ def amax_op(node, symbol_table): return op -def reshape_op(node, symbol_table): +def reshape_op(node: ReshapeOp, symbol_table): """ Import the reshape operation. - From PyTorch `aten.reshape.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `ReshapeOp` operator to MLIR TOSA `reshape` operation. Note: If the new shape contains one and only one `-1`, the size of the new @@ -362,34 +392,30 @@ def reshape_op(node, symbol_table): return op -def unsqueeze_op(node, symbol_table): +def unsqueeze_op(node: UnsqueezeOp, symbol_table): """ Import the unsqueeze operation. - From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `UnsqueezeOp` operator to MLIR TOSA `reshape` operation. - - Note: "unsqueeze" means inserting a new dimension of size 1 at the specified - position. 
For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.unsqueeze.html """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] sizes = ir.RankedTensorType(input_tensor.type).shape - sizes.insert(dim, 1) + if dim == -1: + sizes.append(1) + else: + sizes.insert(dim, 1) new_shape_content = array.array("i", sizes) new_shape_content = memoryview(new_shape_content) op = tosa.ReshapeOp(input_tensor, new_shape_content) return op -def select_op(node, symbol_table): +def select_op(node: SelectOp, symbol_table): """ Import the select operation. - From PyTorch `aten.select.int` operator to MLIR TOSA `reshape` operation. - - Note: "select" means slicing the input tensor along the selected dimension - at the given index. For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.select.html + From buddy graph ir's `SelectOp` operator to MLIR TOSA `reshape` + operation. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] @@ -416,14 +442,11 @@ def select_op(node, symbol_table): return op -def slice_op(node, symbol_table): +def slice_op(node: SliceOp, symbol_table): """ Import the slice operation. - From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice` + From buddy graph ir's `SliceOp` operator to MLIR tensor `extract_slice` operation. - - Note: "slice" means slicing the input tensor along the selected dimension - from a given start index to an end index. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] @@ -477,17 +500,19 @@ def slice_op(node, symbol_table): return op -def convert_element_type_op(node, symbol_table): +def convert_element_type_op(node: ConvertElementTypeOp, symbol_table): """ Import the element type conversion operation. - From PyTorch `prims.convert_element_type.default` operator to - MLIR TOSA `cast` operation. + From buddy graph ir's `ConvertElementTypeOp` operator to MLIR TOSA + `cast` operation. 
""" - # maintain a mapping of torch types and mlir types + # maintain a mapping of buddy dtype to mlir types types_mapping = { - torch.float64: ir.F64Type.get(), - torch.float32: ir.F32Type.get(), - torch.float16: ir.F16Type.get(), + TensorDType.Float64: ir.F64Type.get(), + TensorDType.Float32: ir.F32Type.get(), + TensorDType.Float16: ir.F16Type.get(), + TensorDType.Int32: ir.IntegerType.get_signless(32), + TensorDType.Bool: ir.IntegerType.get_signless(1), } input_tensor = symbol_table.get((str(node.args[0]), 0)) to_cast_type = types_mapping[node.args[1]] @@ -496,13 +521,13 @@ def convert_element_type_op(node, symbol_table): return tosa.CastOp(output_type, input_tensor) -def clone_op(node, symbol_table): +def clone_op(node: CloneOp, symbol_table): """ Import the clone operation. - From PyTorch `aten.clone.default` operator to MLIR TOSA `identity` + From buddy graph ir's `CloneOp` operator to MLIR TOSA `identity` operation. - Note: Since MLIR follow the SSA form, when using the `identity` operation, + Note: Since MLIR follows the SSA form, when using the `identity` operation, we actually deep-copies the original tensor. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) @@ -513,13 +538,16 @@ def clone_op(node, symbol_table): return tosa.IdentityOp(output_type, input_tensor) -def var_mean_op(node, symbol_table): +def var_mean_op(node: VarMeanOp, symbol_table): """ Import the variance & mean operation. - From PyTorch `aten.var_mean.default` operator to two MLIR TOSA `mul` + From buddy graph ir's `VarMeanOp` operator to two MLIR TOSA `mul` operation. - Note: The conversion procedure can be splited into two steps: + Note: By now, this conversion function follows PyTorch's `var_mean` + semantic. + + The conversion procedure can be splited into two steps: 1. In the first part, we calculate the mean value along the given dimension(s) in `mean_dim_op` function. 
We first reduce the input tensor along the given dimension(s) using tosa's `reduce_sum` @@ -667,10 +695,10 @@ def var_dim_op( return var_op, mean_op -def permute_op(node, symbol_table): +def permute_op(node: PermuteOp, symbol_table): """ Import the permute operation. - From PyTorch `aten.permute.default` operator to MLIR TOSA `transpose` + From buddy graph ir's `PermuteOp` operator to MLIR TOSA `transpose` operation. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) @@ -693,10 +721,10 @@ def permute_op(node, symbol_table): return permute_op -def embedding_op(node, symbol_table): +def embedding_op(node: EmbeddingOp, symbol_table): """ Import the embedding operation. - From PyTorch `aten.embedding.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `EmbeddingOp` operator to MLIR TOSA `reshape` operation. Note: Althought this conversion function will finally return a `reshape` @@ -754,10 +782,10 @@ def embedding_op(node, symbol_table): return op -def expand_op(node, symbol_table) -> ir.Operation: +def expand_op(node: ExpandOp, symbol_table) -> ir.Operation: """ Import the expand operation. - From PyTorch `aten.expand.default` operator to MLIR TOSA `add` operation. + From buddy graph ir's `ExpandOp` operator to MLIR TOSA `add` operation. Note: This conversion is implemented using the broadcast machanism of TOSA `add` operation. We allocate a tensor with the shape to expand and @@ -787,11 +815,10 @@ def expand_op(node, symbol_table) -> ir.Operation: return op -def sum_op(node, symbol_table): +def sum_op(node: SumDimOp, symbol_table): """ Import the sum operation. - From PyTorch `aten.sum.dim_IntList` operator to MLIR TOSA `reduce_sum` - operation. 
+ From buddy graph ir's `SumDimOp` operator to MLIR TOSA `reduce_sum` """ input_tensor = symbol_table.get((str(node.args[0]), 0)) reduce_sum_dims = node.args[1] @@ -813,40 +840,37 @@ def sum_op(node, symbol_table): return reduce_sum_op -def t_op(node, symbol_table): +def t_op(node: TOp, symbol_table): """ Import the tensor transpose operation. - From PyTorch `aten.t.default` operator to MLIR TOSA `reduce_sum` operation. + From buddy graph ir's `TOp` operator to MLIR TOSA `transpose` operation """ assert len(node.args) == 1 input1 = symbol_table.get((str(node.args[0]), 0)) - if input1 is None: - return + assert input1 is not None input_shape = list(ir.RankedTensorType(input1.type).shape) - output_shape = list(node.meta["tensor_meta"].shape) - if len(input_shape) == 2: - perm_const_op = tosa.ConstOp( - ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0]))) - ) - result_element_type = ir.RankedTensorType(input1.type).element_type - permute_result_type = ir.RankedTensorType.get( - output_shape, result_element_type - ) - op = tosa.TransposeOp( - permute_result_type, input1, perm_const_op.results[0] - ) + output_shape = list(node.tensor_meta["shape"]) + assert len(input_shape) == 2, "Input tensor must be 2D" + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0]))) + ) + result_element_type = ir.RankedTensorType(input1.type).element_type + permute_result_type = ir.RankedTensorType.get( + output_shape, result_element_type + ) + op = tosa.TransposeOp(permute_result_type, input1, perm_const_op.results[0]) return op -def transpose_op(node, symbol_table): +def transpose_op(node: TransposeOp, symbol_table): """ Import the tensor permute operation based on input dims. - From PyTorch `aten.transpose.int` operator to MLIR TOSA `reduce_sum` + From buddy graph ir's `TransposeOp` operator to MLIR TOSA `transpose` operation. 
""" - assert len(node.args) == 3 + assert len(node.args) == 3, "Input tensor must be 3D" input1 = symbol_table.get((str(node.args[0]), 0)) if input1 is None: return @@ -857,7 +881,7 @@ def transpose_op(node, symbol_table): temp = perm_list[dim1] perm_list[dim1] = perm_list[dim2] perm_list[dim2] = temp - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) perm_const_op = tosa.ConstOp( ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) ) @@ -870,29 +894,352 @@ def transpose_op(node, symbol_table): return op +def maxpool2d_op(node: MaxPool2dOp, symbol_table): + """ + Import the maxpool2d operation. + From Buddy MaxPool2dOp to MLIR TOSA `max_pool2d` operation. + """ + if len(node.args) == 5: + raise NotImplementedError + input1 = symbol_table.get((str(node.args[0]), 0)) + kernel = node.args[1] + stride = node.args[2] + if len(node.args) > 3: + pad = node.args[3] + else: + pad = [0 for _ in kernel] + dtype = node.tensor_meta["dtype"] + result_element_type = mlir_element_type_get(dtype) + if node._layout.find("NCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = list(ir.RankedTensorType(input1.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + input1 = tosa.TransposeOp( + permute_result_type, input1, perm_const_op.results[0] + ).result + out_shape = node.tensor_meta["shape"] + if len(pad) == 1: + pad = [pad[0]] * 4 + elif len(pad) == 2: + pad = [pad[0]] * 2 + [pad[1]] * 2 + kernel_attr = ir._denseI64ArrayAttr(kernel, None) + stride_attr = ir._denseI64ArrayAttr(stride, None) + pad_attr = ir._denseI64ArrayAttr(pad, None) + if node._layout.find("NCHW") != -1: + perm_shape = [] + 
perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + out_shape = perm_shape + output = ir.RankedTensorType.get(out_shape, result_element_type) + op = tosa.MaxPool2dOp(output, input1, kernel_attr, stride_attr, pad_attr) + if node._layout.find("NCHW") != -1: + perm_list = [0, 3, 1, 2] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + perm_shape.append(out_shape[2]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + op = tosa.TransposeOp( + permute_result_type, op.result, perm_const_op.results[0] + ) + return op + + +def convolution2d_op(node: Conv2dOp, symbol_table): + """ + Import the convolution operation. + From Buddy Conv2dOp to MLIR TOSA `conv2d` operation. + """ + assert len(node.args) == 9 + input1 = symbol_table.get((str(node.args[0]), 0)) + weight = symbol_table.get((str(node.args[1]), 0)) + is_kernel_transposed = node.args[6] + dtype = node.tensor_meta["dtype"] + result_element_type = mlir_element_type_get(dtype) + if node._layout.find("NCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = list(ir.RankedTensorType(input1.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + input1 = tosa.TransposeOp( + permute_result_type, input1, perm_const_op.results[0] + ).result + if node._layout.find("FCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = 
list(ir.RankedTensorType(weight.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + weight = tosa.TransposeOp( + permute_result_type, weight, perm_const_op.results[0] + ).result + if is_kernel_transposed: + in_channels = list(ir.RankedTensorType(weight.type).shape)[0] + out_channels = list(ir.RankedTensorType(weight.type).shape)[1] + else: + in_channels = list(ir.RankedTensorType(weight.type).shape)[1] + out_channels = list(ir.RankedTensorType(weight.type).shape)[0] + if len(node._parents) == 2: + new_size_tensor_type = ir.RankedTensorType.get( + [out_channels], result_element_type + ) + element = mlir_element_attr_get(dtype, 0) + new_size_attr = ir.DenseElementsAttr.get_splat( + new_size_tensor_type, element + ) + bias_tensor = tosa.ConstOp(new_size_attr).results[0] + else: + bias_tensor = symbol_table.get((str(node.args[2]), 0)) + assert input1 != None and weight != None and bias_tensor != None + stride = node.args[3] + input_padding = node.args[4] + if len(input_padding) == 1: + input_padding = [input_padding[0]] * 4 + elif len(input_padding) == 2: + input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2 + dilation = node.args[5] + groups = node.args[8] + out_shape = node.tensor_meta["shape"] + if node._layout.find("NCHW") != -1: + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + out_shape = perm_shape + output = ir.RankedTensorType.get(out_shape, result_element_type) + stride_attr = ir._denseI64ArrayAttr(stride, None) + assert groups == 1, 'tosa.conv2d only support one group' + if is_kernel_transposed: + if sum(input_padding) > 0 or sum(dilation) > len(dilation): + raise NotImplementedError + out_padding = node.args[7] + for i in 
range(len(out_padding), 4): + out_padding = [0] + out_padding + out_padding_attr = ir._denseI64ArrayAttr(out_padding, None) + out_shape_attr = ir._denseI64ArrayAttr(out_shape, None) + op = tosa.TransposeConv2DOp( + output, + input1, + weight, + bias_tensor, + out_padding_attr, + stride_attr, + out_shape_attr, + ) + else: + input_padding_attr = ir._denseI64ArrayAttr(input_padding, None) + dilation_attr = ir._denseI64ArrayAttr(dilation, None) + op = tosa.Conv2DOp( + output, + input1, + weight, + bias_tensor, + input_padding_attr, + stride_attr, + dilation_attr, + ) + if node._layout.find("NCHW") != -1: + perm_list = [0, 3, 1, 2] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + perm_shape.append(out_shape[2]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + op = tosa.TransposeOp( + permute_result_type, op.result, perm_const_op.results[0] + ) + return op + + +def relu_op(node: ReluOp, symbol_table): + """ + Import the tensor relu operation. + From Buddy ReluOp to MLIR TOSA `maximum` operation. + """ + assert len(node.args) == 1 + input1 = symbol_table.get((str(node.args[0]), 0)) + if input1 is None: + return + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, 0) + tensor_type = ir.RankedTensorType.get(output_shape, element.type) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + zero_op = tosa.ConstOp(attr) + result_element_type = mlir_element_type_get(dtype) + op = tosa.MaximumOp(tensor_type, input1, zero_op) + + return op + + +def iota_op(node: IotaOp, symbol_table): + """ + Import the tensor iota operation. + From Buddy IotaOp to MLIR TOSA `ConstOp` operation. 
+ """ + assert len(node.args) == 1 + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + start = node.kwargs["start"] + end = node.args[0] + step = node.kwargs["step"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + attr = ir.DenseElementsAttr.get( + numpy.arange(start, end, step), + type=tensor_type, + ) + op = tosa.ConstOp(attr) + + return op + + +def sigmoid_op(node: SigmoidOp, symbol_table): + """ + Import the tensor sigmoid operation. + From Buddy SigmoidOp to MLIR TOSA `SigmoidOp` operation. + """ + assert len(node.args) == 1 + input1 = symbol_table.get((str(node.args[0]), 0)) + if input1 is None: + return + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + op = tosa.SigmoidOp(tensor_type, input1) + + return op + + +def reciprocal_op(node: ReciprocalOp, symbol_table): + input_tensor = symbol_table.get((str(node.args[0]), 0)) + return tosa.ReciprocalOp(input_tensor.type, input_tensor) + + +def mean_op(node: MeanOp, symbol_table): + input_tensor = symbol_table.get((str(node.args[0]), 0)) + keepdim = node.args[2] + dims = [x for x in node.args[1]] + if isinstance(dims, int): + dims = [dims] + + for dim_item_idx, _ in enumerate(dims): + if dims[dim_item_idx] < 0: + dims[dim_item_idx] += len( + ir.RankedTensorType(input_tensor.type).shape + ) + + reduce_sum_result = input_tensor + for dim_item in dims: + reduce_dim_attr = ir.IntegerAttr.get( + ir.IntegerType.get_signless(32), dim_item + ) + reduce_sum_op = tosa.ReduceSumOp(reduce_sum_result, reduce_dim_attr) + reduce_sum_result = reduce_sum_op.results[0] + + tensor_shp = ir.RankedTensorType(input_tensor.type).shape + dim_size = 1 + + for dim_item in dims: + dim_size *= tensor_shp[dim_item] + + denominator_const_op = tosa.ConstOp( + 
ir.DenseElementsAttr.get(memoryview(array.array("f", [dim_size]))) + ) + reciprocal_op = tosa.ReciprocalOp( + denominator_const_op.results[0].type, denominator_const_op + ) + + ret = tosa.MulOp( + reduce_sum_op.results[0].type, + reciprocal_op.results[0], + reduce_sum_op.results[0], + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + + if not keepdim: + result_shp = ir.RankedTensorType(ret.results[0].type).shape + result_shp = [siz for siz in result_shp if siz != 1] + ret = tosa.ReshapeOp( + ret.results[0], memoryview(array.array("i", result_shp)) + ) + + return ret + + ops_registry = { - "add.Tensor": add_op, - "mul.Tensor": mul_op, - "sub.Tensor": sub_op, - "sum.dim_IntList": sum_op, - "tanh.default": tanh_op, - "amax.default": amax_op, - "rsqrt.default": rsqrt_op, - "bmm.default": bmm_op, - "clone.default": clone_op, - "div.Tensor": div_op, - "exp.default": exp_op, - "expand.default": expand_op, - "var_mean.correction": var_mean_op, - "addmm.default": addmm_op, - "reshape.default": reshape_op, - "view.default": reshape_op, - "select.int": select_op, - "slice.Tensor": slice_op, - "embedding.default": embedding_op, - "convert_element_type.default": convert_element_type_op, - "permute.default": permute_op, - "unsqueeze.default": unsqueeze_op, - "t.default": t_op, - "transpose.int": transpose_op, + "AddOp": add_op, + "MulOp": mul_op, + "SubOp": sub_op, + "SumDimOp": sum_op, + "TanhOp": tanh_op, + "AmaxOp": amax_op, + "RsqrtOp": rsqrt_op, + "BatchMatmulOp": bmm_op, + "CloneOp": clone_op, + "DivOp": div_op, + "ExpOp": exp_op, + "ExpandOp": expand_op, + "VarMeanOp": var_mean_op, + "AddMMOp": addmm_op, + "ReshapeOp": reshape_op, + "ViewOp": reshape_op, + "SelectOp": select_op, + "SliceOp": slice_op, + "EmbeddingOp": embedding_op, + "ConvertElementTypeOp": convert_element_type_op, + "PermuteOp": permute_op, + "UnsqueezeOp": unsqueeze_op, + "TOp": t_op, + "TransposeOp": transpose_op, + "MaxPool2dOp": maxpool2d_op, + "Conv2dOp": convolution2d_op, + "ReluOp": 
relu_op, + "IotaOp": iota_op, + "SigmoidOp": sigmoid_op, + "ReciprocalOp": reciprocal_op, + "MeanOp": mean_op, } diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py new file mode 100644 index 000000000..337f5a6b4 --- /dev/null +++ b/frontend/Python/ops/utils.py @@ -0,0 +1,56 @@ +# ===- utils.py ---------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# The element utils of mlir element type and attribute. +# +# ===--------------------------------------------------------------------------- + +from typing import Dict +import mlir.ir as ir + +from ..graph import TensorDType + + +def mlir_element_type_get(type_name): + """ + Get the mlir element type base on TensorDType's enum type. + Args: + type_name: The TensorDType's enum type. + """ + match type_name: + case TensorDType.Float32: + return ir.F32Type.get() + case TensorDType.Int64: + return ir.IntegerType.get_signless(64) + case TensorDType.Bool: + return ir.IntegerType.get_signless(1) + + +def mlir_element_attr_get(type_name, value): + """ + Get the mlir element attribute base on TensorDType's enum type and value. + Args: + type_name: The TensorDType's enum type. + value: The real value for mlir element attribute. 
+ """ + match type_name: + case TensorDType.Float32: + return ir.FloatAttr.get(ir.F32Type.get(), value) + case TensorDType.Int64: + return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value) + case TensorDType.Bool: + return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) + diff --git a/midend/include/Dialect/VectorExp/VectorExpOps.td b/midend/include/Dialect/VectorExp/VectorExpOps.td index 67f492643..aeacba34d 100644 --- a/midend/include/Dialect/VectorExp/VectorExpOps.td +++ b/midend/include/Dialect/VectorExp/VectorExpOps.td @@ -25,6 +25,8 @@ include "VectorExpDialect.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/AttrTypeBase.td" + //===----------------------------------------------------------------------===// // Vector Predication Operation //===----------------------------------------------------------------------===// @@ -41,6 +43,30 @@ def VectorExp_PredicationOp : VectorExp_Op<"predication"> { "$region `:` type($result)"; } +//===----------------------------------------------------------------------===// +// Vector GetVL Operation +//===----------------------------------------------------------------------===// + +def VectorExp_GetVLOp : VectorExp_Op<"get_vl"> { + let summary = "Vector Experiment GetVL Operation."; + let arguments = (ins TypeAttr:$dtype, IndexAttr:$lmul); + let results = (outs Index:$result); + let assemblyFormat = "$dtype `,` $lmul attr-dict `:` type($result)"; +} + +//===----------------------------------------------------------------------===// +// Vector SetVL Operation +//===----------------------------------------------------------------------===// + +def VectorExp_SetVLOp : VectorExp_Op<"set_vl"> { + let summary = "Vector Experiment SetVL Operation."; + let arguments = (ins Index:$vl); + // TODO: Add optional returns. 
+ // let results = (outs AnyType:$result); + let regions = (region AnyRegion:$region); + let assemblyFormat = "$vl attr-dict `:` type($vl) $region"; +} + //===----------------------------------------------------------------------===// // Vector Load Operation with Dynamic Length //===----------------------------------------------------------------------===// diff --git a/midend/include/Utils/DAPUtils.h b/midend/include/Utils/DAPUtils.h new file mode 100644 index 000000000..9a9f418c7 --- /dev/null +++ b/midend/include/Utils/DAPUtils.h @@ -0,0 +1,58 @@ +//====- DAPUtils.h --------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file defines DAP dialect specific utility functions for the buddy +// compiler ecosystem. 
+// +//===----------------------------------------------------------------------===// + +#ifndef INCLUDE_UTILS_DAPUTILS_H +#define INCLUDE_UTILS_DAPUTILS_H + +#include "Utils/Utils.h" +#include + +using namespace mlir; + +namespace buddy { +namespace dap { + +// Generate 5 vector params from SOS matrices +SmallVector generateSOSParams(OpBuilder &rewriter, Location loc, + VectorType vectorTy, Value f0, Value f1, + Value c0, Value c1, Value c2, Value c4, + Value c5, Value filterSize, + Value kernel); + +// Processing iir operation, result are stored in output MemRef +void biquadProcess(OpBuilder &rewriter, Location loc, VectorType vectorTy, + Value f0, Value c0, Value c1, Value cUpperBound, + Value iUpperBound, SmallVector SOSParams, + ArrayRef arrayRef, Value N, Value input, + Value output); + +// Total process for a specific vector length iir vectorization process +void iirVectorizationProcess(OpBuilder &rewriter, Location loc, uint64_t vecLen, + FloatType floatType, Value f0, Value f1, Value c0, + Value c1, Value c2, Value c4, Value c5, + Value filterSize, Value kernel, + ArrayRef arrayRef, Value N, Value input, + Value output); + +} // namespace dap +} // namespace buddy + +#endif // INCLUDE_UTILS_DAPUTILS_H diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index 0d94bfa16..fc47e4171 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(LowerBud) add_subdirectory(LowerDIP) add_subdirectory(LowerRVV) add_subdirectory(LowerDAP) +add_subdirectory(DAPVectorization) add_subdirectory(MatMulOptimization) add_subdirectory(TransposeOptimization) add_subdirectory(ConvOptimization) @@ -11,3 +12,4 @@ add_subdirectory(LowerGemmini) add_subdirectory(LowerLinalgToGemmini) add_subdirectory(SchedulingOnDevices) add_subdirectory(LowerSche) +add_subdirectory(MLIRGPU) diff --git a/midend/lib/Conversion/DAPVectorization/CMakeLists.txt 
b/midend/lib/Conversion/DAPVectorization/CMakeLists.txt new file mode 100644 index 000000000..d67592051 --- /dev/null +++ b/midend/lib/Conversion/DAPVectorization/CMakeLists.txt @@ -0,0 +1,6 @@ +add_mlir_library(DAPVectorization + DAPVectorization.cpp + + LINK_LIBS PUBLIC + BuddyDAPUtils +) diff --git a/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp new file mode 100644 index 000000000..8c3eb3069 --- /dev/null +++ b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp @@ -0,0 +1,222 @@ +//====- DAPVectorization.cpp - DAP Dialect Vectorization Pass ------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file defines DAP dialect vectorization pass. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Pass/Pass.h" + +#include "DAP/DAPDialect.h" +#include "DAP/DAPOps.h" +#include "Utils/DAPUtils.h" +#include + +using namespace mlir; +using namespace buddy; +using namespace vector; +using namespace mlir::arith; +using namespace mlir::linalg; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { +class DAPIirVectorization : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + explicit DAPIirVectorization(MLIRContext *context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(dap::IirOp op, + PatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto ctx = op->getContext(); + + Value input = op->getOperand(0); + Value kernel = op->getOperand(1); + Value output = op->getOperand(2); + + Value c0 = rewriter.create(loc, 0); + Value c1 = rewriter.create(loc, 1); + Value c2 = rewriter.create(loc, 2); + Value c4 = rewriter.create(loc, 4); + Value c5 = rewriter.create(loc, 5); + Value c8 = rewriter.create(loc, 8); + Value c16 = rewriter.create(loc, 16); + Value c32 = rewriter.create(loc, 32); + + Value N = rewriter.create(loc, input, c0); + Value filterSize = rewriter.create(loc, kernel, c0); + + FloatType f32 = FloatType::getF32(ctx); + Value f0 = rewriter.create(loc, APFloat(0.0f), f32); + Value f1 = rewriter.create(loc, APFloat(1.0f), f32); + + Value cond4 = + rewriter.create(loc, CmpIPredicate::ule, filterSize, c4); + 
Value cond8 = + rewriter.create(loc, CmpIPredicate::ule, filterSize, c8); + Value cond16 = + rewriter.create(loc, CmpIPredicate::ule, filterSize, c16); + Value cond32 = + rewriter.create(loc, CmpIPredicate::ule, filterSize, c32); + + // clang-format off + rewriter.create(loc, cond4, + /*thenBuilder=*/ + [&](OpBuilder &builder, Location loc) { + dap::iirVectorizationProcess(builder, loc, 4, f32, f0, f1, c0, c1, c2, c4, c5, + filterSize, kernel, ArrayRef{0, 0, 1, 2}, + N, input, output); + + builder.create(loc); + }, + /*elseBuilder=*/ + [&](OpBuilder &builder, Location loc) { + builder.create(loc, cond8, + /*thenBuilder=*/ + [&](OpBuilder &builder, Location loc){ + dap::iirVectorizationProcess(builder, loc, 8, f32, f0, f1, c0, c1, c2, c4, c5, + filterSize, kernel, + ArrayRef{0, 0, 1, 2, 3, 4, 5, 6}, N, + input, output); + + builder.create(loc); + }, + /*elseBuilder=*/ + [&](OpBuilder &builder, Location loc) { + builder.create(loc, cond16, + /*thenBuilder=*/ + [&](OpBuilder &builder, Location loc){ + dap::iirVectorizationProcess(builder, loc, 16, f32, f0, f1, c0, c1, c2, c4, c5, + filterSize, kernel, ArrayRef{0, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, N, + input, output); + + builder.create(loc); + }, + /*elseBuilder=*/ + [&](OpBuilder &builder, Location loc) { + builder.create(loc, cond32, + /*thenBuilder=*/ + [&](OpBuilder &builder, Location loc){ + dap::iirVectorizationProcess(builder, loc, 32, f32, f0, f1, c0, c1, c2, c4, c5, + filterSize, kernel, ArrayRef{0, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30}, N, input, output); + + builder.create(loc); + }, + /*elseBuilder=*/ + [&](OpBuilder &builder, Location loc) { + dap::iirVectorizationProcess(builder, loc, 64, f32, f0, f1, c0, c1, c2, c4, c5, + filterSize, kernel, ArrayRef{0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 
39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62}, N, input, output); + + builder.create(loc); + } + ); + builder.create(loc); + }); + + builder.create(loc); + }); + + builder.create(loc); + }); + // clang-format on + + rewriter.eraseOp(op); + return success(); + } +}; + +} // end anonymous namespace + +void populateVectorizeDAPConversionPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +//===----------------------------------------------------------------------===// +// VectorizeDAPPass +//===----------------------------------------------------------------------===// + +namespace { +class VectorizeDAPPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VectorizeDAPPass) + VectorizeDAPPass() = default; + VectorizeDAPPass(const VectorizeDAPPass &) {} + + StringRef getArgument() const final { return "vectorize-dap"; } + StringRef getDescription() const final { return "Vectorize DAP Dialect."; } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } +}; +} // end anonymous namespace. 
+ +void VectorizeDAPPass::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + ConversionTarget target(*context); + // clang-format off + target.addLegalDialect< + affine::AffineDialect, + scf::SCFDialect, + func::FuncDialect, + memref::MemRefDialect, + VectorDialect, + arith::ArithDialect, + linalg::LinalgDialect>(); + target.addLegalOp(); + // clang-format on + + RewritePatternSet patterns(context); + populateVectorizeDAPConversionPatterns(patterns); + + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +namespace mlir { +namespace buddy { +void registerDAPVectorizePass() { PassRegistration(); } +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp index 33148d547..bf77f358b 100644 --- a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp +++ b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp @@ -21,11 +21,11 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "DAP/DAPDialect.h" #include "DAP/DAPOps.h" @@ -175,10 +175,7 @@ class DAPIirLowering : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; - explicit DAPIirLowering(MLIRContext *context, int64_t strideParam) - : OpRewritePattern(context) { - stride = strideParam; - } + explicit DAPIirLowering(MLIRContext *context) : OpRewritePattern(context) {} LogicalResult matchAndRewrite(dap::IirOp op, PatternRewriter &rewriter) const override { @@ -197,141 +194,60 @@ class DAPIirLowering : public OpRewritePattern { Value N = rewriter.create(loc, 
input, c0); Value filterSize = rewriter.create(loc, kernel, c0); - Value strideVal = rewriter.create(loc, stride); FloatType f32 = FloatType::getF32(ctx); - VectorType vectorTy32 = VectorType::get({stride}, f32); - - Value zr = rewriter.create(loc, APFloat(float(0)), f32); - // calculate the upper bound of the FIR part - Value strictN = rewriter.create(loc, N, c2); - Value strideRem = rewriter.create(loc, strictN, strideVal); - Value upperN = rewriter.create(loc, N, strideRem); - // loop over every row in SOS matrix rewriter.create( - loc, c0, filterSize, c1, ValueRange{std::nullopt}, - [&](OpBuilder &builder, Location loc, ValueRange ivs, - ValueRange iargs) { - Value b0 = builder.create(loc, kernel, - ValueRange{ivs[0], c0}); - Value b1 = builder.create(loc, kernel, - ValueRange{ivs[0], c1}); - Value b2 = builder.create(loc, kernel, - ValueRange{ivs[0], c2}); - // Value a0 of kernel is not used - Value a1 = builder.create(loc, kernel, - ValueRange{ivs[0], c4}); - Value a2 = builder.create(loc, kernel, - ValueRange{ivs[0], c5}); + loc, c0, filterSize, c1, ValueRange{input}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iarg) { + Value b0 = + builder.create(loc, kernel, ValueRange{iv, c0}); + Value b1 = + builder.create(loc, kernel, ValueRange{iv, c1}); + Value b2 = + builder.create(loc, kernel, ValueRange{iv, c2}); + Value a1 = + builder.create(loc, kernel, ValueRange{iv, c4}); + Value a2 = + builder.create(loc, kernel, ValueRange{iv, c5}); Value z1 = builder.create(loc, APFloat(float(0)), f32); Value z2 = builder.create(loc, APFloat(float(0)), f32); - Value x0 = builder.create(loc, input, ValueRange{c0}); - Value temp = builder.create(loc, b0, x0); - builder.create(loc, temp, output, ValueRange{c0}); - - Value x1 = builder.create(loc, input, ValueRange{c1}); - Value temp0 = builder.create(loc, b0, x1); - Value temp1 = builder.create(loc, b1, x0); - Value temp2 = builder.create(loc, temp0, temp1); - builder.create(loc, temp2, output, 
ValueRange{c1}); - - Value Vecb0 = - builder.create(loc, vectorTy32, b0); - Value Vecb1 = - builder.create(loc, vectorTy32, b1); - Value Vecb2 = - builder.create(loc, vectorTy32, b2); - - // A biquad filter expression: - // y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]; - // FIR part - builder.create( - loc, c2, upperN, strideVal, ValueRange{std::nullopt}, - [&](OpBuilder &builder, Location loc, Value iv, - ValueRange itrargs) { - Value idx0 = iv; - Value idx1 = builder.create(loc, idx0, c1); - Value idx2 = builder.create(loc, idx0, c2); - - Value inputVec0 = builder.create(loc, vectorTy32, input, - ValueRange{idx0}); - Value inputVec1 = builder.create(loc, vectorTy32, input, - ValueRange{idx1}); - Value inputVec2 = builder.create(loc, vectorTy32, input, - ValueRange{idx2}); - - Value outputVec = - rewriter.create(loc, vectorTy32, zr); - Value resVec0 = - builder.create(loc, inputVec0, Vecb0, outputVec); - Value resVec1 = - builder.create(loc, inputVec1, Vecb1, resVec0); - Value resVec2 = - builder.create(loc, inputVec2, Vecb2, resVec1); - builder.create(loc, resVec2, output, ValueRange{idx0}); - - builder.create(loc, std::nullopt); - }); - - // process the remain data of FIR part - Value idx1 = builder.create(loc, upperN, c1); - Value idx2 = builder.create(loc, upperN, c2); - Value in1 = - builder.create(loc, input, ValueRange{idx1}); - Value in2 = - builder.create(loc, input, ValueRange{idx2}); - - builder.create( - loc, upperN, N, c1, ValueRange{in1, in2}, - [&](OpBuilder &builder, Location loc, Value iv, - ValueRange itrargs) { - Value in0 = - builder.create(loc, input, ValueRange{iv}); - - Value temp0 = builder.create(loc, b0, in0); - Value temp1 = builder.create(loc, b1, in1); - Value temp2 = builder.create(loc, b2, in2); - Value sum0 = builder.create(loc, temp0, temp1); - Value sum1 = builder.create(loc, sum0, temp2); - - builder.create(loc, sum1, output, ValueRange{iv}); - - builder.create(loc, std::vector{in0, in1}); - }); - - // IIR part 
+ // Loop reordering, compute z1 for next iteration, z2 for the second + // following iteration. builder.create( loc, c0, N, c1, ValueRange{z1, z2}, [&](OpBuilder &builder, Location loc, Value iv, - ValueRange itrargs) { - Value x = - builder.create(loc, output, ValueRange{iv}); - Value t1 = builder.create(loc, a1, itrargs[1]); - Value t2 = builder.create(loc, a2, itrargs[0]); - Value y = builder.create(loc, t1, t2); - Value opt = builder.create(loc, x, y); - - builder.create(loc, opt, output, - ValueRange{iv}); + ValueRange iargs) { + Value inElem = builder.create(loc, iarg[0], iv); + Value t0 = builder.create(loc, b0, inElem); + Value outElem = + builder.create(loc, t0, iargs[0]); + + Value t1 = builder.create(loc, b1, inElem); + Value t2 = builder.create(loc, a1, outElem); + Value t3 = builder.create(loc, t1, t2); + Value z1Next = builder.create(loc, t3, iargs[1]); + + Value t4 = builder.create(loc, b2, inElem); + Value t5 = builder.create(loc, a2, outElem); + Value z2Next = builder.create(loc, t4, t5); + + builder.create(loc, outElem, output, iv); builder.create( - loc, std::vector{itrargs[1], opt}); + loc, std::vector{z1Next, z2Next}); }); - builder.create(loc, output, input); - builder.create(loc, std::nullopt); + + builder.create(loc, output); }); rewriter.eraseOp(op); return success(); } - -private: - int64_t stride; }; } // end anonymous namespace @@ -340,7 +256,7 @@ void populateLowerDAPConversionPatterns(RewritePatternSet &patterns, int64_t stride) { patterns.add(patterns.getContext()); patterns.add(patterns.getContext(), stride); - patterns.add(patterns.getContext(), stride); + patterns.add(patterns.getContext()); } //===----------------------------------------------------------------------===// @@ -363,7 +279,8 @@ class LowerDAPPass : public PassWrapper> { void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); + affine::AffineDialect, arith::ArithDialect, + linalg::LinalgDialect>(); } Option stride{*this, 
"DAP-vector-splitting", llvm::cl::desc("Vector splitting size."), @@ -376,10 +293,10 @@ void LowerDAPPass::runOnOperation() { ModuleOp module = getOperation(); ConversionTarget target(*context); - target.addLegalDialect(); + target + .addLegalDialect(); target.addLegalOp(); RewritePatternSet patterns(context); diff --git a/midend/lib/Conversion/LowerSche/LowerSchePass.cpp b/midend/lib/Conversion/LowerSche/LowerSchePass.cpp index 882ffd2f4..0d1da54f2 100644 --- a/midend/lib/Conversion/LowerSche/LowerSchePass.cpp +++ b/midend/lib/Conversion/LowerSche/LowerSchePass.cpp @@ -1,4 +1,4 @@ -//====- LowerSchePass.cpp - Sche Dialect Lowering Pass ---------------------===// +//====- LowerSchePass.cpp - Sche Dialect Lowering Pass -------------------===// // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -19,19 +19,19 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Async/IR/Async.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Async/IR/Async.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinDialect.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/IR/Builders.h" #include "Bud/BudDialect.h" #include "Bud/BudOps.h" @@ -40,7 +40,6 @@ #include - using namespace mlir; using namespace buddy; @@ -50,81 +49,100 @@ using namespace buddy; namespace { 
-class WaitOpScheLowering : public ConversionPattern { +class WaitOpScheLowering : public ConversionPattern { public: - explicit WaitOpScheLowering(TypeConverter &typeConverter, MLIRContext *context) - : ConversionPattern(typeConverter, sche::WaitOp::getOperationName(), 1, context) {} - + explicit WaitOpScheLowering(TypeConverter &typeConverter, + MLIRContext *context) + : ConversionPattern(typeConverter, sche::WaitOp::getOperationName(), 1, + context) {} LogicalResult - matchAndRewrite(Operation* op, ArrayRef operands, + matchAndRewrite(Operation *op, ArrayRef operands, mlir::ConversionPatternRewriter &rewriter) const final { assert(operands.size() == 1); auto loc = op->getLoc(); - auto typeConverter = getTypeConverter(); + // auto typeConverter = getTypeConverter(); rewriter.setInsertionPoint(op); - auto awaitOp = rewriter.create(loc, operands[0]); + rewriter.create(loc, operands[0]); rewriter.eraseOp(op); return success(); } }; -//lower to GPU Dialect -class OnDeviceOpScheLowering : public ConversionPattern { +// lower to GPU Dialect +class OnDeviceOpScheLowering : public ConversionPattern { public: -explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *context) - : ConversionPattern(typeConverter, sche::OnDeviceOp::getOperationName(), 1, context) {} - - //convert operands with tensor or vector type into memref operands, and register these operands to GPU - OpBuilder::InsertPoint convertOperands(mlir::ConversionPatternRewriter &rewriter, ValueRange operands, IRMapping &mp, Location& loc, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointToBlockStart) const { + explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, + MLIRContext *context) + : ConversionPattern(typeConverter, sche::OnDeviceOp::getOperationName(), + 1, context) {} + + // convert operands with tensor or vector type into memref operands, and + // register these operands to GPU + OpBuilder::InsertPoint + 
convertOperands(mlir::ConversionPatternRewriter &rewriter, + ValueRange operands, IRMapping &mp, Location &loc, + OpBuilder::InsertPoint insertPointBeforeOp, + OpBuilder::InsertPoint insertPointToBlockStart) const { rewriter.restoreInsertionPoint(insertPointBeforeOp); - for(auto v : operands){ + for (auto v : operands) { auto t = v.getType(); - if(isa(t)){ + if (isa(t)) { auto shape = dyn_cast(t).getShape(); auto ele_type = dyn_cast(t).getElementType(); - auto to_memref_op = rewriter.create(loc, MemRefType::get(shape, ele_type), v); + auto to_memref_op = rewriter.create( + loc, MemRefType::get(shape, ele_type), v); mp.map(v, to_memref_op.getResult()); - auto memref_cast_op = rewriter.create(loc, UnrankedMemRefType::get(ele_type, {}), to_memref_op.getResult()); - auto host_register_op = rewriter.create(loc, memref_cast_op.getResult()); - } - else if(isa(t)){ + auto memref_cast_op = rewriter.create( + loc, UnrankedMemRefType::get(ele_type, {}), + to_memref_op.getResult()); + rewriter.create(loc, memref_cast_op.getResult()); + } else if (isa(t)) { auto shape = dyn_cast(t).getShape(); auto ele_type = dyn_cast(t).getElementType(); auto mem_type = MemRefType::get(shape, ele_type); auto alloc_op = rewriter.create(loc, mem_type); - auto memref_cast_op = rewriter.create(loc, UnrankedMemRefType::get(ele_type, {}), alloc_op.getResult()); - auto host_register_op = rewriter.create(loc, memref_cast_op.getResult()); - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); + auto memref_cast_op = rewriter.create( + loc, UnrankedMemRefType::get(ele_type, {}), alloc_op.getResult()); + rewriter.create(loc, memref_cast_op.getResult()); + auto idx0 = + rewriter.create(loc, rewriter.getIndexAttr(0)) + .getResult(); llvm::SmallVector indices(shape.size(), idx0); - auto vector_transfer_write_op = rewriter.create(loc, v, alloc_op.getResult(), indices); + rewriter.create(loc, v, alloc_op.getResult(), + indices); mp.map(v, alloc_op.getResult()); - } - else if(isa(t)){ - auto 
host_register_op = rewriter.create(loc, v); - } - else if(isa(t)){ + } else if (isa(t)) { + rewriter.create(loc, v); + } else if (isa(t)) { auto memref_type = dyn_cast(t); - auto memref_cast_op = rewriter.create(loc, UnrankedMemRefType::get(memref_type.getElementType(), memref_type.getMemorySpace()), v); - auto host_register_op = rewriter.create(loc, memref_cast_op.getResult()); - } - else{ + auto memref_cast_op = rewriter.create( + loc, + UnrankedMemRefType::get(memref_type.getElementType(), + memref_type.getMemorySpace()), + v); + rewriter.create(loc, memref_cast_op.getResult()); + } else { continue; } } rewriter.restoreInsertionPoint(insertPointToBlockStart); - for(auto v : operands){ + for (auto v : operands) { auto t = v.getType(); - if(isa(t)){ - auto to_tensor_op = rewriter.create(loc, t, mp.lookup(v)); + if (isa(t)) { + auto to_tensor_op = rewriter.create( + loc, t, mp.lookup(v)); mp.map(v, to_tensor_op.getResult()); - } - else if(isa(t)){ - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); - llvm::SmallVector indices(dyn_cast(t).getShape().size(), idx0); - auto transfer_read_op = rewriter.create(loc, dyn_cast(t), mp.lookup(v), indices); + } else if (isa(t)) { + auto idx0 = + rewriter.create(loc, rewriter.getIndexAttr(0)) + .getResult(); + llvm::SmallVector indices( + dyn_cast(t).getShape().size(), idx0); + auto transfer_read_op = rewriter.create( + loc, dyn_cast(t), mp.lookup(v), indices); mp.map(v, transfer_read_op.getResult()); } } @@ -132,160 +150,209 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte return rewriter.saveInsertionPoint(); } - //convert results with tensor or vector type into memref , and register these results to GPU - SmallVector convertResults(mlir::ConversionPatternRewriter &rewriter, ValueRange results, IRMapping &mp, Location& loc, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointAfterGpuLaunchOp) const { + // convert results with tensor or vector 
type into memref , and register these + // results to GPU + SmallVector + convertResults(mlir::ConversionPatternRewriter &rewriter, ValueRange results, + IRMapping &mp, Location &loc, + OpBuilder::InsertPoint insertPointBeforeOp, + OpBuilder::InsertPoint insertPointAfterGpuLaunchOp) const { rewriter.restoreInsertionPoint(insertPointBeforeOp); SmallVector result_memrefs; - for(auto v : results){ + for (auto v : results) { MemRefType mem_type; auto t = v.getType(); - //TODO:必须要有rank - if(isa(t)){ + // TODO: must have the rank + if (isa(t)) { auto shape = dyn_cast(t).getShape(); auto ele_type = dyn_cast(t).getElementType(); mem_type = MemRefType::get(shape, ele_type); - } - else if(isa(t)){ + } else if (isa(t)) { auto shape = dyn_cast(t).getShape(); auto ele_type = dyn_cast(t).getElementType(); mem_type = MemRefType::get(shape, ele_type); - } - else if(isa(t)){ + } else if (isa(t)) { mem_type = dyn_cast(t); - } - else{ + } else { mem_type = MemRefType::get({1}, t); } auto alloc_op = rewriter.create(loc, mem_type); result_memrefs.push_back(alloc_op.getResult()); - auto memref_cast_op = rewriter.create(loc, UnrankedMemRefType::get(mem_type.getElementType(), mem_type.getMemorySpace()), alloc_op.getResult()); - auto host_register_op = rewriter.create(loc, memref_cast_op.getResult()); + auto memref_cast_op = rewriter.create( + loc, + UnrankedMemRefType::get(mem_type.getElementType(), + mem_type.getMemorySpace()), + alloc_op.getResult()); + rewriter.create(loc, memref_cast_op.getResult()); } rewriter.restoreInsertionPoint(insertPointAfterGpuLaunchOp); - //convert result'type into original type for returning - int i=0; - for(auto v : results){ + // convert result'type into original type for returning + int i = 0; + for (auto v : results) { auto t = v.getType(); - //TODO:必须要有rank - if(isa(t)){ - auto shape = dyn_cast_or_null(t).getShape(); - auto ele_type = dyn_cast_or_null(t).getElementType(); - auto to_tensor_op = rewriter.create(loc, t, result_memrefs[i++]); + // TODO: 
must have the rank + if (isa(t)) { + auto to_tensor_op = rewriter.create( + loc, t, result_memrefs[i++]); v.replaceAllUsesWith(to_tensor_op.getResult()); - } - else if(isa(t)){ - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); - llvm::SmallVector indices(dyn_cast(t).getShape().size(), idx0); - auto transfer_read_op = rewriter.create(loc, dyn_cast(t), result_memrefs[i++], indices); + } else if (isa(t)) { + auto idx0 = + rewriter.create(loc, rewriter.getIndexAttr(0)) + .getResult(); + llvm::SmallVector indices( + dyn_cast(t).getShape().size(), idx0); + auto transfer_read_op = rewriter.create( + loc, dyn_cast(t), result_memrefs[i++], indices); v.replaceAllUsesWith(transfer_read_op.getResult()); - } - else if(isa(t)){ + } else if (isa(t)) { v.replaceAllUsesWith(result_memrefs[i++]); - } - else{ - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); - auto load_op = rewriter.create(loc, v.getType(), result_memrefs[i++], ValueRange{idx0}); + } else { + auto idx0 = + rewriter.create(loc, rewriter.getIndexAttr(0)) + .getResult(); + auto load_op = rewriter.create( + loc, v.getType(), result_memrefs[i++], ValueRange{idx0}); v.replaceAllUsesWith(load_op.getResult()); } } return result_memrefs; } - //OnDeviceOp from ScfForOp conversion - void lowerFromForOp(scf::ForOp forOp, gpu::LaunchOp gpuLaunchOp, OpBuilder::InsertPoint insertPointBeforeOp, OpBuilder::InsertPoint insertPointInGpuLaunchBody, Location loc, PatternRewriter &rewriter,Value gridX, Value gridY, Value gridZ, Value blockX, Value blockY, Value blockZ) const { + // OnDeviceOp from ScfForOp conversion + void lowerFromForOp(scf::ForOp forOp, gpu::LaunchOp gpuLaunchOp, + OpBuilder::InsertPoint insertPointBeforeOp, + OpBuilder::InsertPoint insertPointInGpuLaunchBody, + Location loc, PatternRewriter &rewriter, Value gridX, + Value gridY, Value gridZ, Value blockX, Value blockY, + Value blockZ) const { rewriter.restoreInsertionPoint(insertPointBeforeOp); Value upperBound = 
forOp.getUpperBound(); Value lowerBound = forOp.getLowerBound(); Value step = forOp.getStep(); - //Calculate the step size range required in a block + // Calculate the step size range required in a block auto range = rewriter.create(loc, upperBound, lowerBound); - Value stepRange = rewriter.create(loc, range.getResult(), step); - Value stepRangeInBlock = rewriter.create(loc, stepRange, gridX); + Value stepRange = + rewriter.create(loc, range.getResult(), step); + Value stepRangeInBlock = + rewriter.create(loc, stepRange, gridX); Value remInBlock = rewriter.create(loc, stepRange, gridX); - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); - auto idx1 = rewriter.create(loc, rewriter.getIndexAttr(1)).getResult(); - - auto& body = gpuLaunchOp.getBody(); - auto& bodyBlock = body.front(); + auto idx0 = + rewriter.create(loc, rewriter.getIndexAttr(0)) + .getResult(); + auto idx1 = + rewriter.create(loc, rewriter.getIndexAttr(1)) + .getResult(); rewriter.restoreInsertionPoint(insertPointInGpuLaunchBody); - Value start = rewriter.create(loc, stepRangeInBlock, gpuLaunchOp.getBlockIds().x); + Value start = rewriter.create(loc, stepRangeInBlock, + gpuLaunchOp.getBlockIds().x); start = rewriter.create(loc, start, lowerBound); - Value cmp_rem_blkId = rewriter.create(loc, arith::CmpIPredicate::sgt, remInBlock, gpuLaunchOp.getBlockIds().x); - Value cmp_rem_blkId_index = rewriter.create(loc, rewriter.getIndexType(), cmp_rem_blkId); - stepRangeInBlock = rewriter.create(loc, cmp_rem_blkId_index, stepRangeInBlock); - Value min = rewriter.create(loc, gpuLaunchOp.getBlockIds().x, remInBlock); + Value cmp_rem_blkId = + rewriter.create(loc, arith::CmpIPredicate::sgt, + remInBlock, gpuLaunchOp.getBlockIds().x); + Value cmp_rem_blkId_index = rewriter.create( + loc, rewriter.getIndexType(), cmp_rem_blkId); + stepRangeInBlock = rewriter.create(loc, cmp_rem_blkId_index, + stepRangeInBlock); + Value min = rewriter.create( + loc, gpuLaunchOp.getBlockIds().x, remInBlock); 
start = rewriter.create(loc, start, min); - //Calculate the step size range required in a thread - Value stepRangeInThread = rewriter.create(loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX()); - Value remInThread = rewriter.create(loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX()); - Value cmp_rem_threadId = rewriter.create(loc, arith::CmpIPredicate::sgt, remInThread, gpuLaunchOp.getThreadIds().x); - Value cmp_rem_threadId_index = rewriter.create(loc, rewriter.getIndexType(), cmp_rem_threadId); - stepRangeInThread = rewriter.create(loc, cmp_rem_threadId_index, stepRangeInThread); - - Value end = rewriter.create(loc, start, stepRangeInThread); - - auto sub_forOp = rewriter.create(loc, idx0, stepRangeInThread, idx1, forOp.getInitArgs(), - [&](OpBuilder& builder, Location loc, - Value iv, ValueRange iterArgs) - { - Block &bodyBlock = forOp.getRegion().front();//original forOp's bodyBlock - IRMapping mp; - iv = builder.create(loc, iv, gpuLaunchOp.getBlockSizeX()); - iv = builder.create(loc, iv, gpuLaunchOp.getThreadIds().x); - iv = builder.create(loc, iv, step); - iv = builder.create(loc, iv, start); - mp.map(bodyBlock.getArgument(0), iv); - for(auto&& [a, b] : llvm::zip(bodyBlock.getArguments().drop_front(), iterArgs)){ - mp.map(a, b); - } - for(auto&& op_ : bodyBlock.getOperations()){ - builder.insert(op_.clone(mp)); - } - }); + // Calculate the step size range required in a thread + Value stepRangeInThread = rewriter.create( + loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX()); + Value remInThread = rewriter.create( + loc, stepRangeInBlock, gpuLaunchOp.getBlockSizeX()); + Value cmp_rem_threadId = rewriter.create( + loc, arith::CmpIPredicate::sgt, remInThread, + gpuLaunchOp.getThreadIds().x); + Value cmp_rem_threadId_index = rewriter.create( + loc, rewriter.getIndexType(), cmp_rem_threadId); + stepRangeInThread = rewriter.create( + loc, cmp_rem_threadId_index, stepRangeInThread); + + rewriter.create(loc, start, stepRangeInThread); + + rewriter.create( + loc, idx0, 
stepRangeInThread, idx1, forOp.getInitArgs(), + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) { + Block &bodyBlock = + forOp.getRegion().front(); // original forOp's bodyBlock + IRMapping mp; + iv = builder.create(loc, iv, + gpuLaunchOp.getBlockSizeX()); + iv = builder.create(loc, iv, + gpuLaunchOp.getThreadIds().x); + iv = builder.create(loc, iv, step); + iv = builder.create(loc, iv, start); + mp.map(bodyBlock.getArgument(0), iv); + for (auto &&[a, b] : + llvm::zip(bodyBlock.getArguments().drop_front(), iterArgs)) { + mp.map(a, b); + } + for (auto &&op_ : bodyBlock.getOperations()) { + builder.insert(op_.clone(mp)); + } + }); } LogicalResult - matchAndRewrite(Operation* op, ArrayRef operands, + matchAndRewrite(Operation *op, ArrayRef operands, mlir::ConversionPatternRewriter &rewriter) const final { auto loc = op->getLoc(); auto onDeviceOp = dyn_cast(op); rewriter.setInsertionPoint(op); - auto grid_x = rewriter.create(loc, rewriter.getIndexAttr(3)).getResult(); - auto grid_y = rewriter.create(loc, rewriter.getIndexAttr(1)).getResult(); - auto grid_z = rewriter.create(loc, rewriter.getIndexAttr(1)).getResult(); - auto block_x = rewriter.create(loc, rewriter.getIndexAttr(3)).getResult(); - auto block_y = rewriter.create(loc, rewriter.getIndexAttr(1)).getResult(); - auto block_z = rewriter.create(loc, rewriter.getIndexAttr(1)).getResult(); + auto grid_x = + rewriter.create(loc, rewriter.getIndexAttr(3)) + .getResult(); + auto grid_y = + rewriter.create(loc, rewriter.getIndexAttr(1)) + .getResult(); + auto grid_z = + rewriter.create(loc, rewriter.getIndexAttr(1)) + .getResult(); + auto block_x = + rewriter.create(loc, rewriter.getIndexAttr(3)) + .getResult(); + auto block_y = + rewriter.create(loc, rewriter.getIndexAttr(1)) + .getResult(); + auto block_z = + rewriter.create(loc, rewriter.getIndexAttr(1)) + .getResult(); OpBuilder::InsertPoint insertBeforeOp, insertAfterOp; gpu::LaunchOp gpu_launch_op; - //if use async + // if use async Value 
token = onDeviceOp.getAsyncToken(); - if(token){ - auto asyncDependencies = operands.take_front(onDeviceOp.getODSOperandIndexAndLength(0).second - onDeviceOp.getODSOperandIndexAndLength(0).first); - auto async_exec_op = rewriter.create(loc, TypeRange{}, asyncDependencies, ValueRange{}); + if (token) { + auto asyncDependencies = + operands.take_front(onDeviceOp.getODSOperandIndexAndLength(0).second - + onDeviceOp.getODSOperandIndexAndLength(0).first); + auto async_exec_op = rewriter.create( + loc, TypeRange{}, asyncDependencies, ValueRange{}); rewriter.replaceAllUsesWith(token, async_exec_op.getToken()); - auto& bodyBlock = async_exec_op.getBodyRegion().front(); + auto &bodyBlock = async_exec_op.getBodyRegion().front(); rewriter.setInsertionPointToStart(&bodyBlock); - gpu_launch_op = rewriter.create(loc, grid_x, grid_y, grid_z, block_x, block_y, block_z); + gpu_launch_op = rewriter.create( + loc, grid_x, grid_y, grid_z, block_x, block_y, block_z); rewriter.setInsertionPoint(async_exec_op); insertBeforeOp = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(async_exec_op); insertAfterOp = rewriter.saveInsertionPoint(); - }else{ - gpu_launch_op = rewriter.create(loc, grid_x, grid_y, grid_z, block_x, block_y, block_z); + } else { + gpu_launch_op = rewriter.create( + loc, grid_x, grid_y, grid_z, block_x, block_y, block_z); rewriter.setInsertionPoint(gpu_launch_op); insertBeforeOp = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(gpu_launch_op); insertAfterOp = rewriter.saveInsertionPoint(); } - auto& bodyBlock = gpu_launch_op.getBody().front(); + auto &bodyBlock = gpu_launch_op.getBody().front(); rewriter.setInsertionPointToStart(&bodyBlock); auto insertToStart = rewriter.saveInsertionPoint(); @@ -293,59 +360,73 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte auto insertToEnd = rewriter.saveInsertionPoint(); IRMapping mp; - auto innerOperands = 
operands.take_back(onDeviceOp.getODSOperandIndexAndLength(1).second - onDeviceOp.getODSOperandIndexAndLength(1).first); - auto insertPointInGpuLaunchBody = convertOperands(rewriter, innerOperands, mp, loc, insertBeforeOp, insertToStart); + auto innerOperands = + operands.take_back(onDeviceOp.getODSOperandIndexAndLength(1).second - + onDeviceOp.getODSOperandIndexAndLength(1).first); + auto insertPointInGpuLaunchBody = convertOperands( + rewriter, innerOperands, mp, loc, insertBeforeOp, insertToStart); auto results = onDeviceOp.getInnerResults(); - auto result_memrefs = convertResults(rewriter, results, mp, loc, insertBeforeOp, insertAfterOp); - + auto result_memrefs = convertResults(rewriter, results, mp, loc, + insertBeforeOp, insertAfterOp); + assert(isa(op->getAttr("sche.source"))); - auto sche_source = dyn_cast_or_null(op->getAttr("sche.source")).strref(); + auto sche_source = + dyn_cast_or_null(op->getAttr("sche.source")).strref(); - //scf::for lower - if(sche_source == "scf.for"){ - Operation& op_ = onDeviceOp.getRegion().front().front(); + // scf::for lower + if (sche_source == "scf.for") { + Operation &op_ = onDeviceOp.getRegion().front().front(); auto for_op = dyn_cast(op_); - lowerFromForOp(for_op, gpu_launch_op, insertBeforeOp, insertPointInGpuLaunchBody, loc, rewriter, grid_x, grid_y, grid_z, block_x, block_y, block_z); - } - else if(sche_source == "func"){ + lowerFromForOp(for_op, gpu_launch_op, insertBeforeOp, + insertPointInGpuLaunchBody, loc, rewriter, grid_x, grid_y, + grid_z, block_x, block_y, block_z); + } else if (sche_source == "func") { rewriter.restoreInsertionPoint(insertPointInGpuLaunchBody); - for(auto&& op_ : onDeviceOp.getRegion().front().getOperations()){ - if(!op_.hasTrait()){ + for (auto &&op_ : onDeviceOp.getRegion().front().getOperations()) { + if (!op_.hasTrait()) { auto new_op = rewriter.clone(op_, mp); - for(auto&& [a, b] : llvm::zip(op_.getResults(), new_op->getResults())){ + for (auto &&[a, b] : + llvm::zip(op_.getResults(), 
new_op->getResults())) { mp.map(a, b); } - }else{ - int i=0; + } else { + int i = 0; rewriter.restoreInsertionPoint(insertToEnd); - for(auto res : op_.getOperands()){ + for (auto res : op_.getOperands()) { auto t = res.getType(); - //TODO:必须要有rank - if(isa(t)){ + // TODO: must have the rank + if (isa(t)) { auto shape = dyn_cast(t).getShape(); auto ele_type = dyn_cast(t).getElementType(); - auto to_memref_op = rewriter.create(loc, MemRefType::get(shape, ele_type), mp.lookupOrNull(res)); - auto copy_op = rewriter.create(loc, to_memref_op.getResult(), result_memrefs[i++]); - } - else if(isa(t)){ - auto idx0 = rewriter.create(loc, rewriter.getIndexAttr(0)).getResult(); - llvm::SmallVector indices(dyn_cast(t).getShape().size(), idx0); - auto vector_transfer_write_op = rewriter.create(loc, mp.lookupOrNull(res), result_memrefs[i++], indices); - } - else if(isa(t)){ - auto copy_op = rewriter.create(loc, mp.lookupOrNull(res), result_memrefs[i++]); - } - else{ - auto store_op = rewriter.create(loc, mp.lookupOrNull(res), result_memrefs[i++]); + auto to_memref_op = rewriter.create( + loc, MemRefType::get(shape, ele_type), + mp.lookupOrNull(res)); + rewriter.create(loc, to_memref_op.getResult(), + result_memrefs[i++]); + } else if (isa(t)) { + auto idx0 = + rewriter + .create(loc, rewriter.getIndexAttr(0)) + .getResult(); + llvm::SmallVector indices( + dyn_cast(t).getShape().size(), idx0); + rewriter.create( + loc, mp.lookupOrNull(res), result_memrefs[i++], + indices); + } else if (isa(t)) { + rewriter.create(loc, mp.lookupOrNull(res), + result_memrefs[i++]); + } else { + rewriter.create(loc, mp.lookupOrNull(res), + result_memrefs[i++]); } } } } - } - else{ - //TODO add conversion of onDeviceOp from more op - printf("conversion from source %s has not implemented\n", sche_source); - abort(); + } else { + // TODO add conversion of onDeviceOp from more op + op->emitError("Conversion from source " + sche_source + + " has not been implemented"); } 
rewriter.setInsertionPointToEnd(&bodyBlock); @@ -359,7 +440,8 @@ explicit OnDeviceOpScheLowering(TypeConverter &typeConverter, MLIRContext *conte } // end anonymous namespace -void populateLowerScheConversionPatterns(TypeConverter& typeConverter, RewritePatternSet &patterns) { +void populateLowerScheConversionPatterns(TypeConverter &typeConverter, + RewritePatternSet &patterns) { // clang-format off patterns.add(typeConverter, patterns.getContext()); patterns.add(typeConverter, patterns.getContext()); @@ -371,7 +453,8 @@ void populateLowerScheConversionPatterns(TypeConverter& typeConverter, RewritePa //===----------------------------------------------------------------------===// namespace { -class LowerSchePass : public PassWrapper> { +class LowerSchePass + : public PassWrapper> { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerSchePass) LowerSchePass() = default; @@ -422,8 +505,10 @@ void LowerSchePass::runOnOperation() { target.addIllegalDialect(); TypeConverter typeConverter; - typeConverter.addConversion([&](sche::AsyncTokenType type){return async::TokenType::get(context);}); - typeConverter.addConversion([&](Type type){return type;}); + typeConverter.addConversion([&](sche::AsyncTokenType type) { + return async::TokenType::get(context); + }); + typeConverter.addConversion([&](Type type) { return type; }); RewritePatternSet patterns(context); populateLowerScheConversionPatterns(typeConverter, patterns); diff --git a/midend/lib/Conversion/MLIRGPU/CMakeLists.txt b/midend/lib/Conversion/MLIRGPU/CMakeLists.txt new file mode 100644 index 000000000..041c6ff11 --- /dev/null +++ b/midend/lib/Conversion/MLIRGPU/CMakeLists.txt @@ -0,0 +1,30 @@ +add_mlir_library(MLIRGPUPasses + GPUHostRegister.cpp + GPUBufferize.cpp + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Bufferization + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRBufferizationDialect + MLIRControlFlowInterfaces + MLIRFuncDialect + MLIRFunctionInterfaces + MLIRInferTypeOpInterface + 
MLIRIR + MLIRMemRefDialect + MLIRPass + MLIRTensorDialect + MLIRSCFDialect + MLIRSideEffectInterfaces + MLIRSubsetOpInterface + MLIRTransforms + MLIRViewLikeInterface + MLIRSupport + BuddyUtils + MLIRBufferizationTransforms + MLIRGPUDialect +) + + + diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp new file mode 100644 index 000000000..1417a96bb --- /dev/null +++ b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp @@ -0,0 +1,256 @@ +//===- ConvertMemcpyToGPU.cpp +//-------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass that converts memcpy to gpu operations. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +using namespace mlir; +using namespace vector; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class ConvertMemcpyToGPUPattern : public ConversionPattern { +public: + explicit ConvertMemcpyToGPUPattern(MLIRContext *context) + : ConversionPattern(gpu::LaunchFuncOp().getOperationName(), 1, context) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef /*operands*/, + ConversionPatternRewriter &rewriter) const override { + llvm::errs() << op->getName().getStringRef() << "\n"; + return success(); + } + +private: +}; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// ConvertMemcpyToGPUPass +//===----------------------------------------------------------------------===// + +namespace { + +class ConvertMemcpyToGPUPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass) + StringRef getArgument() const final { return "convert-memcpy-to-gpu"; } + StringRef getDescription() const final { + return "Convert memref opertaions to gpu operations."; 
+ } + ConvertMemcpyToGPUPass() = default; + ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass & + + ) {} + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } +}; + +void ConvertMemcpyToGPUPass::runOnOperation() { + auto funcOp = getOperation(); + std::set unDeallocatedOperations; + + // Copy all function arguments to gpu, needs deallocation + OpBuilder builder(funcOp->getContext()); + builder.setInsertionPointToStart(&(funcOp.getBody().front())); + unsigned numArgs = funcOp.getNumArguments(); + for (unsigned i = 0; i < numArgs; ++i) { + BlockArgument arg = funcOp.getArgument(i); + // Create a gpu.alloc op, then copy memory to it + // TODO: Move this out of operation, make the copy process async + auto memrefType = dyn_cast(arg.getType()); + auto gpuAllocOp = builder.create( + builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedOperations.insert(&gpuAllocOp); + auto gpuMemcpyOp = builder.create( + gpuAllocOp.getLoc(), TypeRange(), ValueRange(), gpuAllocOp.getResult(0), + arg); + // Replace all users with GPU memory + auto users = arg.getUsers(); + std::vector usersVec(users.begin(), users.end()); + for (auto user : usersVec) { + // Don't replace memcpy's operand + if (isa(user)) + continue; + for (size_t j = 0; j < user->getNumOperands(); j++) { + if (user->getOperand(j) == arg) { + user->setOperand(j, gpuAllocOp.getResult(0)); + } + } + } + } + + funcOp->walk([&](Operation *nestedOp) { + // Replace all allocations with GPU.alloc + if (auto allocOp = dyn_cast(nestedOp)) { + // Rewrite this allocOp to gpu.alloc, change for all users + builder.setInsertionPointAfter(allocOp); + auto result = allocOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto gpuAllocOp = builder.create( + allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + auto users = result.getUsers(); + std::vector usersVec(users.begin(), users.end()); + for (auto user 
: usersVec) { + for (size_t j = 0; j < user->getNumOperands(); j++) { + // Only the return value will not have dealloc op + if (auto deallocOp = dyn_cast(user)) { + builder.setInsertionPointAfter(deallocOp); + auto gpuDeallocOp = builder.create( + deallocOp->getLoc(), TypeRange(), ValueRange(), + gpuAllocOp.getResult(0)); + deallocOp->erase(); + } + else if (user->getOperand(j) == result) { + user->setOperand(j, gpuAllocOp.getResult(0)); + } + } + } + allocOp->erase(); + } + // Replace all memory.copy operations with gpu.memcpy + else if (auto copyOp = dyn_cast(nestedOp)) { + auto src = copyOp.getOperand(0); + auto dst = copyOp.getOperand(1); + // Notice: GPU.memcpy has a different src dst order + builder.setInsertionPointAfter(copyOp); + auto gpuMemcpyOp = builder.create( + copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); + { + auto users = src.getUsers(); + std::vector usersVec(users.begin(), users.end()); + for (auto user : usersVec) { + for (size_t j = 0; j < user->getNumOperands(); j++) { + if (user->getOperand(j) == src) { + user->setOperand(j, gpuMemcpyOp.getOperand(1)); + } + } + } + } + { + auto users = dst.getUsers(); + std::vector usersVec(users.begin(), users.end()); + for (auto user : usersVec) { + for (size_t j = 0; j < user->getNumOperands(); j++) { + if (user->getOperand(j) == src) { + user->setOperand(j, gpuMemcpyOp.getOperand(0)); + } + } + } + } + copyOp->erase(); + } + // Allocate space on GPU and copy global memrefs to GPU, needs deallocation + else if (auto getGlobalOp = dyn_cast(nestedOp)) { + builder.setInsertionPointAfter(getGlobalOp); + auto result = getGlobalOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto gpuAllocOp = builder.create( + getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedOperations.insert(&gpuAllocOp); + auto src = result; + auto dst = gpuAllocOp->getResult(0); + auto gpuMemcpyOp = builder.create( + gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); + { + 
auto users = src.getUsers(); + std::vector usersVec(users.begin(), users.end()); + for (auto user : usersVec) { + if (isa(user)) + continue; + for (size_t j = 0; j < user->getNumOperands(); j++) { + if (user->getOperand(j) == src) { + user->setOperand(j, dst); + } + } + } + } + } + // Copy data back to CPU, deallocate GPU, then return + else if (auto returnOp = dyn_cast(nestedOp)) { + builder.setInsertionPoint(returnOp); + + for (auto* gpuAllocOp: unDeallocatedOperations){ + auto gpuDeallocOp = builder.create( + builder.getUnknownLoc(), TypeRange(), ValueRange(), + gpuAllocOp->getResult(0)); + } + builder.setInsertionPoint(returnOp); + for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { + auto val = returnOp->getOperand(i); + auto memRefType = dyn_cast(val.getType()); + auto allocOp = builder.create( + builder.getUnknownLoc(), memRefType); + auto gpuMemcpyOp = builder.create( + allocOp.getLoc(), TypeRange(), ValueRange(), + allocOp->getResult(0), val); + auto gpuDeallocOp = builder.create( + gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); + returnOp->setOperand(i, allocOp->getResult(0)); + } + } + return WalkResult::advance(); + }); +} +} // end anonymous namespace. + +namespace mlir { +namespace buddy { +void registerConvertMemcpyToGPUPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp b/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp new file mode 100644 index 000000000..94a24e52e --- /dev/null +++ b/midend/lib/Conversion/MLIRGPU/GPUBufferize.cpp @@ -0,0 +1,220 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +//===- GPUBufferizePass.cpp - ---------------------------------------------===// +// +// Wrapper pass to use MLIR's One-Shot Bufferize pass. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" +#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToGPU/VectorToGPU.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" +#include "mlir/Dialect/Bufferization/Transforms/Transforms.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Func/Transforms/Passes.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using mlir::bufferization::BufferizationOptions; +using mlir::bufferization::OneShotAnalysisState; +using 
mlir::bufferization::OneShotBufferizationOptions; + +using namespace mlir; + +namespace { + +bool hasSharedMemoryAddressSpace(MemRefType memrefType) { + auto addrSpace = llvm::dyn_cast_if_present( + memrefType.getMemorySpace()); + return addrSpace && + addrSpace.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace(); +} + +static FailureOr gpuAllocationFn(OpBuilder &builder, Location loc, + MemRefType memRefType, + ValueRange dynamicSizes, + unsigned alignment) { + auto workgroupSpace = gpu::AddressSpaceAttr::get( + builder.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace()); + MemRefType allocType = + MemRefType::get(memRefType.getShape(), memRefType.getElementType(), + AffineMap(), workgroupSpace); + return builder.create(loc, allocType, dynamicSizes) + .getResult(); +} + +static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from, + Value to) { + bool needsBarrier = false; + if (hasSharedMemoryAddressSpace(llvm::cast(from.getType()))) { + needsBarrier = true; + } + if (hasSharedMemoryAddressSpace(llvm::cast(to.getType()))) { + needsBarrier = true; + } + if (needsBarrier) + builder.create(loc); + // Operation *copy = + builder.create(loc, from, to); + if (needsBarrier) { + // setMarker(copy, getCopyToWorkgroupMemoryMarker()); + builder.create(loc); + } + return success(); +} + +/// Pass to convert from tensor based ops to memref based ops. 
+class BuudyGPUBufferizePass + : public PassWrapper> { +public: + explicit BuudyGPUBufferizePass( + std::optional allocationFn = + gpuAllocationFn, + std::optional memCpyFn = gpuCopyFn) + : allocationFn(allocationFn), memCpyFn(memCpyFn) {} + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(BuudyGPUBufferizePass) + StringRef getArgument() const final { return "gpu-bufferize"; } + StringRef getDescription() const final { + return "One shot bufferize GPU pass."; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + // clang-format off + registry + .insert(); + // clang-format on + } + + void runOnOperation() override; + +private: + const std::optional allocationFn; + const std::optional memCpyFn; +}; + + + +} // namespace + +// The following is copied from bufferization::runOneShotBufferize with +// modifications. +LogicalResult +runBuudyOneShotBufferize(Operation *op, + const OneShotBufferizationOptions &options) { + OneShotAnalysisState state(op, options); + if (failed(analyzeOp(op, state))) + return failure(); + if (options.testAnalysisOnly) + return success(); + return bufferization::runOneShotBufferize(op, options); +} + +/// Run comprehensive bufferize. +void BuudyGPUBufferizePass::runOnOperation() { + ModuleOp moduleOp = getOperation(); + OneShotBufferizationOptions options; + options.allocationFn = allocationFn; + options.memCpyFn = memCpyFn; + + if (failed(runBuudyOneShotBufferize(moduleOp, options))) { + return signalPassFailure(); + } + + // Remove redundant args and unused results. 
+ { + RewritePatternSet patterns(&getContext()); + linalg::populateEraseUnusedOperandsAndResultsPatterns(patterns); + if (failed(applyPatternsAndFoldGreedily(moduleOp, std::move(patterns)))) { + return signalPassFailure(); + } + } +} + +std::unique_ptr> createBuudyGPUBufferizePass( + std::optional allocationFn, + std::optional memCpyFn) { + if (!allocationFn) + allocationFn = gpuAllocationFn; + if (!memCpyFn) + memCpyFn = gpuCopyFn; + return std::make_unique(allocationFn, + memCpyFn); +} + +void addBuudyPostBufferizationPasses(OpPassManager &passManager) { + passManager.addPass(memref::createResolveShapedTypeResultDimsPass()); + passManager.addNestedPass(createCanonicalizerPass()); + passManager.addNestedPass(createCSEPass()); + // There are redundant memcpy (with linalg.generic form) ops created, which + // can be deleted by canonicalizer. We have to run it again because the + // memrefs are unified in CSE pass, so we can truely remove redundant memcpy. + passManager.addNestedPass(createCanonicalizerPass()); +} + +void addBuudyGPUBufferizePasses( + OpPassManager &passManager, + std::optional allocationFn, + std::optional memCpyFn) { + passManager.addPass(bufferization::createEmptyTensorEliminationPass()); + passManager.addPass(bufferization::createEmptyTensorToAllocTensorPass()); + passManager.addPass( + createBuudyGPUBufferizePass(allocationFn, memCpyFn)); + addBuudyPostBufferizationPasses(passManager); +} + +namespace mlir { +namespace buddy { +void registerBuddyGPUBufferizePass() { PassRegistration(); } +} // namespace buddy +} // namespace mlir \ No newline at end of file diff --git a/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp b/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp new file mode 100644 index 000000000..e9d8a046f --- /dev/null +++ b/midend/lib/Conversion/MLIRGPU/GPUHostRegister.cpp @@ -0,0 +1,323 @@ +//===- GPUHostRegister.cpp +//-------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the GPU host register pass that adds gpu.host_register. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +using namespace mlir; +using namespace vector; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class GPUHostRegisterPattern : public ConversionPattern { +public: + explicit GPUHostRegisterPattern(MLIRContext *context) + : ConversionPattern(gpu::LaunchFuncOp().getOperationName(), 1, context) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef /*operands*/, + ConversionPatternRewriter &rewriter) const override { + llvm::errs() << op->getName().getStringRef() << "\n"; + return success(); + 
} + +private: +}; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// GPUHostRegisterPass +//===----------------------------------------------------------------------===// + +namespace { + +class GPUHostRegisterPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUHostRegisterPass) + StringRef getArgument() const final { return "gpu-host-register"; } + StringRef getDescription() const final { + return "Register host memory to legalize gpu access."; + } + GPUHostRegisterPass() = default; + GPUHostRegisterPass(const GPUHostRegisterPass & + + ) {} + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } +}; +} // end anonymous namespace. + +Value *getSourceOperand(Operation *op) { + auto operands = op->getOperands(); + Value *memrefOperand = nullptr; + for (auto operand : operands) { + if (!operand.getType().isa()) + continue; + if (memrefOperand) { + llvm_unreachable("Op has more than one memref operand"); + } + memrefOperand = &operand; + } + if (!memrefOperand) { + llvm_unreachable("Op has no memref operand"); + } + return memrefOperand; +} + +std::pair getAllocationOp(Value *value) { + if (auto *producerOp = value->getDefiningOp()) { + if (auto allocOp = dyn_cast(producerOp)) { + // llvm::dbgs()<getName().getStringRef()<<":"<getResult(0)<<"\n"; + // llvm::dbgs()<<"returning location:"<getLoc()<<"\n"; + return {producerOp, 0}; + } + // else if (auto reallocOp) + // else if (auto allocaOp) + // Getglobal needs to create a copy + else if (auto getGlobalOp = dyn_cast(producerOp)) { + return {producerOp, 1}; + } + else if (auto subviewOp = dyn_cast(producerOp)) { + for(auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto loadOp = dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) 
{ + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto collapseShapeOp = + dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto expandShapeOp = + dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto castOp = dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto reinterpretCastOp = + dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto reshapeOp = dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto transposeOp = dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else if (auto viewOp = dyn_cast(producerOp)) { + for (auto operand : producerOp->getOperands()) { + if (!operand.getType().isa()) + continue; + return getAllocationOp(&operand); + } + } + else { + llvm_unreachable("Unknown producer op"); + } + // Look for parent op + } + // llvm::dbgs() << "returning null:" << value << "\n"; + // value->dump(); + // Values comes from outside the function + return {reinterpret_cast(value), 3}; +} +static bool isEqual(const Operation *lhsC, const Operation *rhsC) { + auto *lhs = const_cast(lhsC); + auto *rhs = const_cast(rhsC); + if (lhs == rhs) + return true; + + return OperationEquivalence::isEquivalentTo(const_cast(lhsC), + const_cast(rhsC), + OperationEquivalence::None); +} + +void 
GPUHostRegisterPass::runOnOperation() { + auto module = getOperation(); + std::set allocations; + std::map globalAllocations; + std::set outsideValues; + module->walk([&](Operation *nestedOp) { + if (auto launchFuncOp = dyn_cast(nestedOp)) { + // OpBuilder barrierBuilder(launchFuncOp->getContext()); + // barrierBuilder.setInsertionPointAfter(launchFuncOp); + // barrierBuilder.create(launchFuncOp->getLoc()); + + for (auto operand : launchFuncOp->getOperands()) { + if (!operand.getType().isa()) + continue; + auto res = getAllocationOp(&operand); + auto allocOp = res.first; + Operation *insertionOp = nullptr; + if (!allocOp) + continue; + + if (res.second == 0) { + insertionOp = allocOp; + auto result = allocations.insert(insertionOp); + if (result.second) { + OpBuilder builder(insertionOp->getContext()); + builder.setInsertionPointAfter(insertionOp); + auto memrefType = dyn_cast(operand.getType()); + auto elementType = memrefType.getElementType(); + UnrankedMemRefType resType = + UnrankedMemRefType::get(elementType, 0); + Value cast = builder.create( + insertionOp->getLoc(), resType, insertionOp->getResult(0)); + builder.create(insertionOp->getLoc(), cast); + } else { + // llvm::dbgs() << insertionOp->getName().getStringRef() + // << " has been registered\n"; + } + } + else if (res.second == 1) { + // add a copy for this global op + OpBuilder builder(allocOp->getContext()); + builder.setInsertionPointAfter(allocOp); + auto memrefType = dyn_cast(operand.getType()); + auto newAllocOp = builder.create( + allocOp->getLoc(), memrefType, ValueRange{}); + builder.create( + allocOp->getLoc(), allocOp->getResult(0), newAllocOp.getResult()); + for (size_t i = 0; i < launchFuncOp->getNumOperands(); i++) { + if (launchFuncOp->getOperand(i) == operand) { + launchFuncOp->setOperand(i, newAllocOp.getResult()); + } + } + auto result = allocations.insert(newAllocOp); + auto elementType = memrefType.getElementType(); + UnrankedMemRefType resType = UnrankedMemRefType::get(elementType, 
0); + auto castOp = builder.create( + newAllocOp->getLoc(), resType, newAllocOp->getResult(0)); + builder.create(castOp->getLoc(), + castOp.getResult()); + globalAllocations[allocOp] = &newAllocOp; + } + else if (res.second == 3) { + // Register the external memory directly + auto value = reinterpret_cast(res.first); + if (outsideValues.find(value)!=outsideValues.end()){ + llvm::dbgs()<<"Global value registered.\n"; + return WalkResult::advance(); + } + auto context = operand.getContext(); + auto region = launchFuncOp->getParentRegion(); + auto block = ®ion->front(); + auto loc = launchFuncOp->getParentOp()->getLoc(); + OpBuilder builder(context); + builder.setInsertionPoint(block, block->begin()); + auto memrefType = dyn_cast(operand.getType()); + auto elementType = memrefType.getElementType(); + UnrankedMemRefType resType = + UnrankedMemRefType::get(elementType, 0); + auto castOp = builder.create(loc,resType,*value); + builder.create(loc,castOp.getResult()); + outsideValues.insert(value); + } + } + return WalkResult::advance(); + } else if (auto deallocOp = dyn_cast(nestedOp)) { + auto operand = deallocOp->getOperand(0); + if (!operand.getType().isa()) + return WalkResult::advance(); + if (auto getGlobalOp = + dyn_cast(operand.getDefiningOp())) { + if (globalAllocations.find(operand.getDefiningOp()) != + globalAllocations.end()) { + auto allocOp = globalAllocations[operand.getDefiningOp()]; + deallocOp->setOperand(0, allocOp->getResult()); + } + } + } + return WalkResult::advance(); + }); +} + +namespace mlir { +namespace buddy { +void registerGPUHostRegisterPass() { PassRegistration(); } +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/Utils/CMakeLists.txt b/midend/lib/Utils/CMakeLists.txt index 7d21a6765..ff9aa6e38 100644 --- a/midend/lib/Utils/CMakeLists.txt +++ b/midend/lib/Utils/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_library(BuddyUtils Utils.cpp DIPUtils.cpp + DAPUtils.cpp AffineTransformUtils.cpp ) @@ -9,4 +10,11 @@ 
add_mlir_library(BuddyDIPUtils LINK_LIBS PUBLIC BuddyUtils - ) \ No newline at end of file + ) + +add_mlir_library(BuddyDAPUtils + DAPUtils.cpp + + LINK_LIBS PUBLIC + BuddyUtils + ) diff --git a/midend/lib/Utils/DAPUtils.cpp b/midend/lib/Utils/DAPUtils.cpp new file mode 100644 index 000000000..4586f43b4 --- /dev/null +++ b/midend/lib/Utils/DAPUtils.cpp @@ -0,0 +1,220 @@ +//====- DAPUtils.cpp ------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements DAP dialect specific utility functions for the buddy +// compiler ecosystem. 
+// +//===----------------------------------------------------------------------===// + +#ifndef UTILS_DAPUTILS_DEF +#define UTILS_DAPUTILS_DEF + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DAP/DAPDialect.h" +#include "DAP/DAPOps.h" +#include "Utils/DAPUtils.h" +#include "Utils/Utils.h" + +using namespace mlir; + +namespace buddy { +namespace dap { + +// Generate 5 vector params from SOS matrices +SmallVector generateSOSParams(OpBuilder &rewriter, Location loc, + VectorType vectorTy, Value f0, Value f1, + Value c0, Value c1, Value c2, Value c4, + Value c5, Value filterSize, + Value kernel) { + Value initB0 = rewriter.create(loc, vectorTy, f1); + Value initB1 = rewriter.create(loc, vectorTy, f0); + Value initB2 = rewriter.create(loc, vectorTy, f0); + Value initA1 = rewriter.create(loc, vectorTy, f0); + Value initA2 = rewriter.create(loc, vectorTy, f0); + + // Distribute all params into 5 param vectors + auto vecDistribute = rewriter.create( + loc, c0, filterSize, c1, + ValueRange{initB0, initB1, initB2, initA1, initA2}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value b0 = + builder.create(loc, kernel, ValueRange{iv, c0}); + Value b1 = + builder.create(loc, kernel, ValueRange{iv, c1}); + Value b2 = + builder.create(loc, kernel, ValueRange{iv, c2}); + Value a1 = + builder.create(loc, kernel, ValueRange{iv, c4}); + Value a2 = + builder.create(loc, kernel, ValueRange{iv, c5}); + + Value b0Next = + builder.create(loc, b0, iargs[0], iv); + Value b1Next = + builder.create(loc, b1, iargs[1], iv); + Value b2Next = + builder.create(loc, b2, iargs[2], iv); + Value a1Next = + builder.create(loc, a1, iargs[3], iv); + Value a2Next = + builder.create(loc, a2, iargs[4], iv); + + builder.create( + loc, std::vector{b0Next, b1Next, b2Next, a1Next, a2Next}); + }); + + return SmallVector{vecDistribute.getResults()}; +} + +// Processing iir operation, result are stored in output MemRef +void 
biquadProcess(OpBuilder &rewriter, Location loc, VectorType vectorTy, + Value f0, Value c0, Value c1, Value cUpperBound, + Value iUpperBound, SmallVector SOSParams, + ArrayRef arrayRef, Value N, Value input, + Value output) { + Value vecB0 = SOSParams[0]; + Value vecB1 = SOSParams[1]; + Value vecB2 = SOSParams[2]; + Value vecA1 = SOSParams[3]; + Value vecA2 = SOSParams[4]; + + Value vecOut = rewriter.create(loc, vectorTy, f0); + Value vecS1 = rewriter.create(loc, vectorTy, f0); + Value vecS2 = rewriter.create(loc, vectorTy, f0); + + // Injection stage of the IIR operation, no output produced + auto injectionResult = rewriter.create( + loc, c0, cUpperBound, c1, ValueRange{vecOut, vecS1, vecS2}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value inElem = builder.create(loc, input, iv); + Value vecInMoveRight = builder.create( + loc, iargs[0], iargs[0], arrayRef); + Value vecInNext = builder.create( + loc, inElem, vecInMoveRight, c0); + Value vecOutNext = + builder.create(loc, vecB0, vecInNext, iargs[1]); + + Value vecS1Lhs = + builder.create(loc, vecB1, vecInNext, iargs[2]); + Value vecS1Rhs = builder.create(loc, vecA1, vecOutNext); + Value vecS1Next = + builder.create(loc, vecS1Lhs, vecS1Rhs); + + Value vecS2Lhs = builder.create(loc, vecB2, vecInNext); + Value vecS2Rhs = builder.create(loc, vecA2, vecOutNext); + Value vecS2Next = + builder.create(loc, vecS2Lhs, vecS2Rhs); + + builder.create( + loc, std::vector{vecOutNext, vecS1Next, vecS2Next}); + }); + + Value upperBound = rewriter.create(loc, N, cUpperBound); + + // Processing stage of the IIR operation, starts to produce output + auto processResult = rewriter.create( + loc, c0, upperBound, c1, ValueRange{injectionResult.getResults()}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value index = builder.create(loc, iv, cUpperBound); + Value inElem = builder.create(loc, input, index); + Value vecInMoveRight = builder.create( + loc, iargs[0], iargs[0], arrayRef); + Value
vecInNext = builder.create( + loc, inElem, vecInMoveRight, c0); + Value vecOutNext = + builder.create(loc, vecB0, vecInNext, iargs[1]); + Value outElem = builder.create( + loc, vecOutNext, iUpperBound); + builder.create(loc, outElem, output, iv); + + Value vecS1Lhs = + builder.create(loc, vecB1, vecInNext, iargs[2]); + Value vecS1Rhs = builder.create(loc, vecA1, vecOutNext); + Value vecS1Next = + builder.create(loc, vecS1Lhs, vecS1Rhs); + + Value vecS2Lhs = builder.create(loc, vecB2, vecInNext); + Value vecS2Rhs = builder.create(loc, vecA2, vecOutNext); + Value vecS2Next = + builder.create(loc, vecS2Lhs, vecS2Rhs); + + builder.create( + loc, std::vector{vecOutNext, vecS1Next, vecS2Next}); + }); + + // Tail ending stage of the IIR operation, generates the remaining output + rewriter.create( + loc, upperBound, N, c1, ValueRange{processResult.getResults()}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value vecInMoveRight = builder.create( + loc, iargs[0], iargs[0], arrayRef); + Value vecInNext = builder.create( + loc, f0, vecInMoveRight, c0); + Value vecOutNext = + builder.create(loc, vecB0, vecInNext, iargs[1]); + Value outElem = builder.create( + loc, vecOutNext, iUpperBound); + builder.create(loc, outElem, output, iv); + + Value vecS1Lhs = + builder.create(loc, vecB1, vecInNext, iargs[2]); + Value vecS1Rhs = builder.create(loc, vecA1, vecOutNext); + Value vecS1Next = + builder.create(loc, vecS1Lhs, vecS1Rhs); + + Value vecS2Lhs = builder.create(loc, vecB2, vecInNext); + Value vecS2Rhs = builder.create(loc, vecA2, vecOutNext); + Value vecS2Next = + builder.create(loc, vecS2Lhs, vecS2Rhs); + + builder.create( + loc, std::vector{vecOutNext, vecS1Next, vecS2Next}); + }); +} + +// Complete IIR vectorization process for a specific vector length +void iirVectorizationProcess(OpBuilder &rewriter, Location loc, uint64_t vecLen, + FloatType floatType, Value f0, Value f1, Value c0, + Value c1, Value c2, Value c4, Value c5, + Value filterSize, Value kernel,
+ ArrayRef arrayRef, Value N, Value input, + Value output) { + VectorType vectorTy = VectorType::get(vecLen, floatType); + uint64_t vecLenMinusOne = vecLen - 1; + Value cUpperBound = + rewriter.create(loc, vecLenMinusOne); + Value iUpperBound = rewriter.create( + loc, + /*value=*/vecLenMinusOne, /*width=*/64); + + auto SOSParams = dap::generateSOSParams(rewriter, loc, vectorTy, f0, f1, c0, + c1, c2, c4, c5, filterSize, kernel); + dap::biquadProcess(rewriter, loc, vectorTy, f0, c0, c1, cUpperBound, + iUpperBound, SOSParams, arrayRef, N, input, output); +} + +} // namespace dap +} // namespace buddy +#endif // UTILS_DAPUTILS_DEF diff --git a/requirements.txt b/requirements.txt index 782c70af9..606179eb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ tokenizers == 0.13.3 sentencepiece == 0.1.99 accelerate protobuf +pybind11 == 2.11.1 +torchvision diff --git a/tests/Interface/core/ContainerTest.cpp b/tests/Interface/core/ContainerTest.cpp index 0f69d8938..3d80b3375 100644 --- a/tests/Interface/core/ContainerTest.cpp +++ b/tests/Interface/core/ContainerTest.cpp @@ -56,7 +56,7 @@ int main() { // Test custom shape no malloc constructor. 
//===--------------------------------------------------------------------===// MemRef testCustomShapeNoMallocConstructor(sizes, false, 0); - // CHECK: (nil) + // CHECK: {{(nil)|0x0}} fprintf(stderr, "%p\n", testCustomShapeNoMallocConstructor.getData()); //===--------------------------------------------------------------------===// @@ -64,7 +64,7 @@ int main() { //===--------------------------------------------------------------------===// std::vector arrayShape = {1, 80, 32000}; MemRef testArrayNoMallocConstructor(arrayShape, false, 0); - // CHECK: (nil) + // CHECK: {{(nil)|0x0}} fprintf(stderr, "%p\n", testArrayNoMallocConstructor.getData()); //===--------------------------------------------------------------------===// diff --git a/tests/Python/test_addmm.py b/tests/Python/test_addmm.py index cb4459f45..563c87446 100644 --- a/tests/Python/test_addmm.py +++ b/tests/Python/test_addmm.py @@ -22,8 +22,11 @@ def foo(x, y, z): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2, in3) +graphs = dynamo_compiler.importer(foo, in1, in2, in3) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -32,4 +35,3 @@ def foo(x, y, z): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_amax.py b/tests/Python/test_amax.py index 3759b352c..81944a2c2 100644 --- a/tests/Python/test_amax.py +++ b/tests/Python/test_amax.py @@ -22,8 +22,11 @@ def foo(x, dim): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, dim) +graphs = dynamo_compiler.importer(foo, in1, dim) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -31,4 +34,3 @@ def foo(x, dim): # CHECK: return %{{.*}} # CHECK: } # CHECK: } 
-print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_arange.py b/tests/Python/test_arange.py index ac7fa3c45..f7e1cd1c4 100644 --- a/tests/Python/test_arange.py +++ b/tests/Python/test_arange.py @@ -2,10 +2,9 @@ import torch import torch._dynamo as dynamo -from torch._inductor.decomposition import decompositions as inductor_decomp from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,12 +15,15 @@ def foo(x): # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -29,4 +31,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_arith_add.py b/tests/Python/test_arith_add.py index 44db4609d..9c6e9d312 100644 --- a/tests/Python/test_arith_add.py +++ b/tests/Python/test_arith_add.py @@ -1,11 +1,10 @@ # RUN: %PYTHON %s 2>&1 | FileCheck %s import torch -import torch._dynamo as dynamo from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +16,15 @@ def foo(x, y): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +32,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_arith_div.py b/tests/Python/test_arith_div.py index afc222a15..cf5b29023 100644 --- a/tests/Python/test_arith_div.py +++ b/tests/Python/test_arith_div.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -31,4 +34,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_arith_mul.py b/tests/Python/test_arith_mul.py index 9dc4dfbff..b22c6ebfd 100644 --- a/tests/Python/test_arith_mul.py +++ b/tests/Python/test_arith_mul.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -13,21 +13,24 @@ def foo(x, y): in1 = torch.randn(10) -in2 = torch.randn(10) +in2 = 2 # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = arith.constant # CHECK: %{{.*}} = tosa.mul # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_arith_sub.py b/tests/Python/test_arith_sub.py index 95b5475fc..0f6238afa 100644 --- a/tests/Python/test_arith_sub.py +++ b/tests/Python/test_arith_sub.py @@ -21,8 +21,11 @@ def foo(x, y): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_bmm.py b/tests/Python/test_bmm.py index 403b0621b..ec7c8b160 100644 --- a/tests/Python/test_bmm.py +++ b/tests/Python/test_bmm.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,17 +17,20 @@ def foo(x, y): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.matmul +# CHECK: %{{.*}} = arith.constant +# CHECK: %{{.*}} = linalg.batch_matmul # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_cat.py b/tests/Python/test_cat.py index db9dacf11..9c769ae65 100644 --- a/tests/Python/test_cat.py +++ b/tests/Python/test_cat.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): in2 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -32,4 +35,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_clone.py b/tests/Python/test_clone.py index 24fcd3225..3eabd7d64 100644 --- a/tests/Python/test_clone.py +++ b/tests/Python/test_clone.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,17 +16,19 @@ def foo(x): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.identity +# CHECK: %{{.*}} = tensor.extract_slice # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_convert_element_type.py b/tests/Python/test_convert_element_type.py index 63cd1ddae..ca8838463 100644 --- a/tests/Python/test_convert_element_type.py +++ b/tests/Python/test_convert_element_type.py @@ -13,7 +13,7 @@ def foo(x, to_cast_type): in1 = torch.randn(10).to(torch.float32) -to_cast_type = torch.float16 +to_cast_type = torch.int32 # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( @@ -21,8 +21,11 @@ def foo(x, to_cast_type): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, to_cast_type) +graphs = dynamo_compiler.importer(foo, in1, to_cast_type) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, to_cast_type): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_convolution_default.py b/tests/Python/test_convolution_default.py new file mode 100644 index 000000000..fed1607c7 --- /dev/null +++ b/tests/Python/test_convolution_default.py @@ -0,0 +1,42 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from 
buddy.compiler.ops import tosa + + +class Convolution(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.conv = torch.nn.Conv2d(3, 255, (5, 5), 3, 3, bias=False) + + def forward(self, a): + return self.conv(a) + + +model = Convolution() +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +in1 = torch.randn((1, 3, 640, 480)) +graphs = dynamo_compiler.importer(model, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.transpose +# CHECK: %{{.*}} = "tosa.const"() +# CHECK: %{{.*}} = tosa.transpose +# CHECK: %{{.*}} = tosa.conv2d +# CHECK: %{{.*}} = tosa.transpose +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tests/Python/test_embedding.py b/tests/Python/test_embedding.py index ee76d2068..484bb617b 100644 --- a/tests/Python/test_embedding.py +++ b/tests/Python/test_embedding.py @@ -22,8 +22,11 @@ def foo(weight, indices): weight = torch.randn(10, 5) indices = torch.randint(10, (3, 3)).to(torch.int32) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(weight, indices) +graphs = dynamo_compiler.importer(foo, weight, indices) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -34,16 +37,29 @@ def foo(weight, indices): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) - # test cast case weight = torch.randn(10, 5) indices = torch.randint(10, (3, 3)).to(torch.int64) +graphs = dynamo_compiler.importer(foo, weight, indices) +print(graphs) +assert len(graphs) == 2 +graphs[0].lower_to_top_level_ir() +print(graphs[0]._imported_module) + +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = 
tosa.reshape +# CHECK: %{{.*}} = tosa.reshape +# CHECK: %{{.*}} = tosa.gather +# CHECK: %{{.*}} = tosa.reshape +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(weight, indices) +graphs[1].lower_to_top_level_ir() +print(graphs[1]._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -54,5 +70,4 @@ def foo(weight, indices): # CHECK: %{{.*}} = tosa.reshape # CHECK: return %{{.*}} # CHECK: } -# CHECK: } -print(dynamo_compiler.imported_module) +# CHECK: } \ No newline at end of file diff --git a/tests/Python/test_exp.py b/tests/Python/test_exp.py index 3fcff4361..7519a999b 100644 --- a/tests/Python/test_exp.py +++ b/tests/Python/test_exp.py @@ -20,8 +20,11 @@ def foo(x): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -29,4 +32,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_full.py b/tests/Python/test_full.py index 33cdc2c1d..0a5f5888b 100644 --- a/tests/Python/test_full.py +++ b/tests/Python/test_full.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_index.py b/tests/Python/test_index.py index da31095c1..c21ce1a5f 100644 --- a/tests/Python/test_index.py +++ b/tests/Python/test_index.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): in2 = torch.tensor([1]) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -31,4 +34,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_expand.py b/tests/Python/test_iota.py similarity index 52% rename from tests/Python/test_expand.py rename to tests/Python/test_iota.py index 37e9aca38..d4e9d3e56 100644 --- a/tests/Python/test_expand.py +++ b/tests/Python/test_iota.py @@ -8,25 +8,28 @@ from buddy.compiler.ops import tosa -def foo(x, new_size): - return torch.ops.aten.expand(x, new_size) +class foo(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) -x = torch.randn(1, 3) -new_size = (6, 3) + def forward(self, a): + return torch.arange(a) -# Initialize the dynamo compiler. 
+ +model = foo() dynamo_compiler = DynamoCompiler( primary_registry=tosa.ops_registry, aot_autograd_decomposition=inductor_decomp, ) - -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, new_size) - +in1 = 40 +graphs = dynamo_compiler.importer(model, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.add -# CHECK: return %{{.*}} : tensor<6x3xf32> +# CHECK: %{{.*}} = "tosa.const" +# CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_lt.py b/tests/Python/test_lt.py index a6f30b61c..5cea5ce5f 100644 --- a/tests/Python/test_lt.py +++ b/tests/Python/test_lt.py @@ -5,23 +5,26 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): - return torch.ops.aten.lt(x,y) + return torch.ops.aten.lt(x, y) in1 = torch.ones([13], dtype=torch.int64) in2 = torch.ones([13, 1], dtype=torch.int64) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_masked_fill.py b/tests/Python/test_masked_fill.py index 3802b3de7..3abbe88cd 100644 --- a/tests/Python/test_masked_fill.py +++ b/tests/Python/test_masked_fill.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y, z): @@ -18,12 +18,15 @@ def foo(x, y, z): in3 = 0 # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2, in3) +graphs = dynamo_compiler.importer(foo, in1, in2, in3) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -33,4 +36,3 @@ def foo(x, y, z): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_max_pool2d.py b/tests/Python/test_max_pool2d.py new file mode 100644 index 000000000..eecfc73d9 --- /dev/null +++ b/tests/Python/test_max_pool2d.py @@ -0,0 +1,44 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +class TestModule(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.pool = torch.nn.MaxPool2d((5, 5), 3, (2, 2)) + + def forward(self, a): + return self.pool(a) + + +model = TestModule() +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +in1 = torch.randn((1, 3, 640, 480)) + +model_opt = torch.compile(model, backend=dynamo_compiler) +assert torch.allclose(model_opt(in1), model(in1), equal_nan=True) + +graphs = dynamo_compiler.importer(model, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.transpose +# CHECK: %{{.*}} = tosa.max_pool2d +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.transpose +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git 
a/tests/Python/test_mean.py b/tests/Python/test_mean.py index 781e49416..0595619d1 100644 --- a/tests/Python/test_mean.py +++ b/tests/Python/test_mean.py @@ -1,16 +1,14 @@ # RUN: %PYTHON %s 2>&1 | FileCheck %s import torch -import torch._dynamo as dynamo from torch._inductor.decomposition import decompositions as inductor_decomp -from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.ops import tosa -def foo(x, y, z): - return torch.mean(x, y, z) +def foo(x, y, keepdim): + return torch.mean(x, y, keepdim=keepdim) in1 = torch.ones([13, 13], dtype=torch.float32) @@ -19,17 +17,25 @@ def foo(x, y, z): # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( primary_registry=tosa.ops_registry, - aot_autograd_decomposition=aot_autograd_decompositions, + aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2, in3) +foo_mlir = torch.compile(foo, backend=dynamo_compiler) +assert torch.allclose( + foo_mlir(in1, in2, keepdim=in3), foo(in1, in2, keepdim=in3), equal_nan=True +) +graphs = dynamo_compiler.importer(foo, in1, in2, in3) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = arith.constant -# CHECK: %{{.*}} = linalg.generic +# CHECK: %{{.*}} = tosa.reduce_sum +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.reciprocal +# CHECK: %{{.*}} = tosa.mul # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_mm.py b/tests/Python/test_mm.py index 4440b4ad8..4f7c41df3 100644 --- a/tests/Python/test_mm.py +++ b/tests/Python/test_mm.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops 
import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): in2 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -31,4 +34,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_neg.py b/tests/Python/test_neg.py index e2f9e6f3d..78261085a 100644 --- a/tests/Python/test_neg.py +++ b/tests/Python/test_neg.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,18 +16,20 @@ def foo(x): in1 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward # CHECK: %{{.*}} = tensor.empty -# CHECK: %{{.*}} = linalg.generic +# CHECK: %{{.*}} = linalg.negf # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_ones.py b/tests/Python/test_ones.py index 7343fd102..4af4ead36 100644 --- a/tests/Python/test_ones.py +++ b/tests/Python/test_ones.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,12 +16,15 @@ def foo(x): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -29,4 +32,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_permute.py b/tests/Python/test_permute.py index d260df3c2..7f1aad3e1 100644 --- a/tests/Python/test_permute.py +++ b/tests/Python/test_permute.py @@ -21,8 +21,11 @@ def foo(x, y): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, perm) +graphs = dynamo_compiler.importer(foo, x, perm) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, y): # CHECK: return %{{.*}} : tensor<4x3x2xf32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_pow.py b/tests/Python/test_pow.py index cfc47feb1..d67156383 100644 --- a/tests/Python/test_pow.py +++ b/tests/Python/test_pow.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): @@ -17,12 +17,15 @@ def foo(x, y): in2 = 2 # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -32,4 +35,3 @@ def foo(x, y): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_reciprocal.py b/tests/Python/test_reciprocal.py new file mode 100644 index 000000000..9c31fb8b5 --- /dev/null +++ b/tests/Python/test_reciprocal.py @@ -0,0 +1,36 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import math + + +def foo(x): + return torch.ops.aten.reciprocal(x) + + +x = torch.randn(10, 3, 6) + +# Initialize the dynamo compiler. 
+dynamo_compiler = DynamoCompiler( + primary_registry=math.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +foo_mlir = torch.compile(foo, backend=dynamo_compiler) +assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True) + +graphs = dynamo_compiler.importer(foo, x) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) + +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = tosa.reciprocal +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tests/Python/test_relu.py b/tests/Python/test_relu.py new file mode 100644 index 000000000..c6d6bc6ae --- /dev/null +++ b/tests/Python/test_relu.py @@ -0,0 +1,36 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +class foo(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, a): + return torch.relu(a) + + +model = foo() +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) +in1 = torch.randn((1, 3, 640, 480), device="cpu") +graphs = dynamo_compiler.importer(model, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.maximum +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tests/Python/test_reshape.py b/tests/Python/test_reshape.py index 56a194697..989e0e4da 100644 --- a/tests/Python/test_reshape.py +++ b/tests/Python/test_reshape.py @@ -21,8 +21,11 @@ def foo(x, new_shape): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, new_shape) +graphs = 
dynamo_compiler.importer(foo, x, new_shape) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, new_shape): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_rsqrt.py b/tests/Python/test_rsqrt.py index 8ca0cf929..370334d66 100644 --- a/tests/Python/test_rsqrt.py +++ b/tests/Python/test_rsqrt.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,17 +16,20 @@ def foo(x): # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x) +graphs = dynamo_compiler.importer(foo, x) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.rsqrt +# CHECK: %{{.*}} = tensor.empty() +# CHECK: %{{.*}} = linalg.generic # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_rsub.py b/tests/Python/test_rsub.py index fc945970c..99843af0e 100644 --- a/tests/Python/test_rsub.py +++ b/tests/Python/test_rsub.py @@ -6,28 +6,32 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): - return y-x + return torch.ops.aten.rsub(x, y) in1 = torch.ones([13, 13], dtype=torch.float32) -in2 = torch.ones([13, 13], dtype=torch.float32) +in2 = 2 # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.sub +# CHECK: %{{.*}} = arith.constant +# CHECK: %{{.*}} = tensor.empty() +# CHECK: %{{.*}} = linalg.generic # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_select.py b/tests/Python/test_select.py index d94bd296a..c54420a11 100644 --- a/tests/Python/test_select.py +++ b/tests/Python/test_select.py @@ -22,8 +22,11 @@ def foo(x, dim, index): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, dim, index) +graphs = dynamo_compiler.importer(foo, x, dim, index) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -32,4 +35,3 @@ def foo(x, dim, index): # CHECK: return %{{.*}} : tensor<3x2xf32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_sigmoid.py b/tests/Python/test_sigmoid.py new file mode 100644 index 000000000..43f03cc11 --- /dev/null +++ b/tests/Python/test_sigmoid.py @@ -0,0 +1,35 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +class foo(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, a): + return torch.sigmoid(a) + + +model = foo() 
+dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) +in1 = torch.randn((1, 3, 640, 480), device="cpu") +graphs = dynamo_compiler.importer(model, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = tosa.sigmoid +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tests/Python/test_silu.py b/tests/Python/test_silu.py index dcd919ca5..2aa504776 100644 --- a/tests/Python/test_silu.py +++ b/tests/Python/test_silu.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,12 +16,15 @@ def foo(x): in1 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_slice.py b/tests/Python/test_slice.py index 61a8658e1..acc0acaa2 100644 --- a/tests/Python/test_slice.py +++ b/tests/Python/test_slice.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, dim, start_idx, end_idx): @@ -19,12 +19,15 @@ def foo(x, dim, start_idx, 
end_idx): # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, dim, start_idx, end_idx) +graphs = dynamo_compiler.importer(foo, x, dim, start_idx, end_idx) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -32,4 +35,3 @@ def foo(x, dim, start_idx, end_idx): # CHECK: return %{{.*}} : tensor<3x2x2xf32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_softmax.py b/tests/Python/test_softmax.py index d5e656de7..eca5b2c60 100644 --- a/tests/Python/test_softmax.py +++ b/tests/Python/test_softmax.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,26 +16,22 @@ def foo(x): in1 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tensor.empty +# CHECK: %{{.*}} = arith.constant # CHECK: %{{.*}} = linalg.generic -# CHECK: %{{.*}} = linalg.generic -# CHECK: %{{.*}} = tensor.empty -# CHECK: %{{.*}} = linalg.generic -# CHECK: %{{.*}} = tensor.empty -# CHECK: %{{.*}} = linalg.generic -# CHECK: %{{.*}} = linalg.generic -# CHECK: %{{.*}} = tensor.empty +# CHECK: %{{.*}} = tensor.empty() # CHECK: %{{.*}} = linalg.generic # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_sqrt.py b/tests/Python/test_sqrt.py new file mode 100644 index 000000000..b929d1107 --- /dev/null +++ b/tests/Python/test_sqrt.py @@ -0,0 +1,36 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import math + + +def foo(x): + return torch.ops.aten.sqrt(x) + + +x = torch.randn(10, 3, 6) + +# Initialize the dynamo compiler. 
+dynamo_compiler = DynamoCompiler( + primary_registry=math.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +foo_mlir = torch.compile(foo, backend=dynamo_compiler) +assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True) + +graphs = dynamo_compiler.importer(foo, x) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) + +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = math.sqrt +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tests/Python/test_squeeze.py b/tests/Python/test_squeeze.py index f394ca8d7..e6b1b5c00 100644 --- a/tests/Python/test_squeeze.py +++ b/tests/Python/test_squeeze.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,12 +16,15 @@ def foo(x): in1 = torch.ones([1, 13, 13], dtype=torch.float32) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_sum.py b/tests/Python/test_sum.py index 713910f15..e97f94209 100644 --- a/tests/Python/test_sum.py +++ b/tests/Python/test_sum.py @@ -22,8 +22,11 @@ def foo(x, dim): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, dim) +graphs = dynamo_compiler.importer(foo, x, dim) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -31,4 +34,3 @@ def foo(x, dim): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_t.py b/tests/Python/test_t.py index 835bb4c2f..09d44facc 100644 --- a/tests/Python/test_t.py +++ b/tests/Python/test_t.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,18 +16,20 @@ def foo(x): in1 = torch.ones([13, 13], dtype=torch.float32) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = "tosa.const" -# CHECK: %{{.*}} = tosa.transpose +# CHECK: %{{.*}} = tensor.empty() +# CHECK: %{{.*}} = linalg.transpose # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_tanh.py b/tests/Python/test_tanh.py index b1875dfd5..b9ca6082c 100644 --- a/tests/Python/test_tanh.py +++ b/tests/Python/test_tanh.py @@ -20,8 +20,11 @@ def foo(x): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x) +graphs = dynamo_compiler.importer(foo, x) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -29,4 +32,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_to_copy.py b/tests/Python/test_to_copy.py index 9632d9f5c..0b6c2ad22 100644 --- a/tests/Python/test_to_copy.py +++ b/tests/Python/test_to_copy.py @@ -6,7 +6,7 @@ from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x): @@ -16,12 +16,15 @@ def foo(x): in1 = torch.ones([13, 13], dtype=torch.bool) # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=aot_autograd_decompositions, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1) +graphs = dynamo_compiler.importer(foo, in1) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x): # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_transpose.py b/tests/Python/test_transpose.py index d7e71be8e..9769604f3 100644 --- a/tests/Python/test_transpose.py +++ b/tests/Python/test_transpose.py @@ -3,7 +3,6 @@ import torch import torch._dynamo as dynamo from torch._inductor.decomposition import decompositions as inductor_decomp -from torch._functorch.aot_autograd import aot_autograd_decompositions from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.ops import tosa @@ -19,17 +18,19 @@ def foo(x, y, z): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( primary_registry=tosa.ops_registry, - aot_autograd_decomposition=aot_autograd_decompositions, + aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2, in3) +graphs = dynamo_compiler.importer(foo, in1, in2, in3) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = "tosa.const"() # CHECK: %{{.*}} = tosa.transpose # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_unsqueeze.py b/tests/Python/test_unsqueeze.py index 577354b9f..5cb4ee552 100644 --- a/tests/Python/test_unsqueeze.py +++ b/tests/Python/test_unsqueeze.py @@ -5,7 +5,7 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, dim): @@ -17,12 +17,15 @@ def foo(x, dim): # Initialize the dynamo compiler. 
dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x, dim) +graphs = dynamo_compiler.importer(foo, x, dim) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -30,4 +33,3 @@ def foo(x, dim): # CHECK: return %{{.*}} : tensor<1x10xf32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_var_mean.py b/tests/Python/test_var_mean.py index eb7f254e4..eae1c9983 100644 --- a/tests/Python/test_var_mean.py +++ b/tests/Python/test_var_mean.py @@ -24,8 +24,11 @@ def foo_keepdim(x): aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(x) +graphs = dynamo_compiler.importer(foo, x) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward @@ -44,10 +47,33 @@ def foo_keepdim(x): # CHECK: return %{{.*}} : tensor<f32>, tensor<f32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) -foo_keepdim_mlir = dynamo.optimize(dynamo_compiler)(foo_keepdim) -foo_keepdim_mlir(x) +graphs = dynamo_compiler.importer(foo_keepdim, x) +assert len(graphs) == 2 +graphs[0].lower_to_top_level_ir() +print(graphs[0]._imported_module) + +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = tosa.reduce_sum +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.reciprocal +# CHECK: %{{.*}} = tosa.mul +# CHECK: %{{.*}} = tosa.sub +# CHECK: %{{.*}} = tosa.mul +# CHECK: %{{.*}} = tosa.reduce_sum +# CHECK: %{{.*}} = "tosa.const" +# CHECK: %{{.*}} = tosa.reciprocal +# CHECK: %{{.*}} = tosa.mul +# CHECK: %{{.*}} = tosa.reshape +# CHECK: %{{.*}} = tosa.reshape +# CHECK: return %{{.*}} : tensor<f32>, tensor<f32> +# CHECK: } +# CHECK: } +
+graphs[1].lower_to_top_level_ir() +print(graphs[1]._imported_module) + # CHECK: module { # CHECK-LABEL: func.func @forward # CHECK: %{{.*}} = tosa.reduce_sum @@ -63,4 +89,3 @@ def foo_keepdim(x): # CHECK: return %{{.*}} : tensor<1x1x1xf32>, tensor<1x1x1xf32> # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_view.py b/tests/Python/test_view.py index 44db4609d..31eacddc7 100644 --- a/tests/Python/test_view.py +++ b/tests/Python/test_view.py @@ -5,29 +5,31 @@ from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.ops import tosa +from buddy.compiler.ops import linalg def foo(x, y): - return x + y + return torch.ops.aten.view(x, y) in1 = torch.randn(10) -in2 = torch.randn(10) +in2 = (2, 5) # Initialize the dynamo compiler. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=linalg.ops_registry, aot_autograd_decomposition=inductor_decomp, ) -foo_mlir = dynamo.optimize(dynamo_compiler)(foo) -foo_mlir(in1, in2) +graphs = dynamo_compiler.importer(foo, in1, in2) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) # CHECK: module { # CHECK-LABEL: func.func @forward -# CHECK: %{{.*}} = tosa.add +# CHECK: %{{.*}} = tosa.reshape # CHECK: return %{{.*}} # CHECK: } # CHECK: } -print(dynamo_compiler.imported_module) diff --git a/tests/Python/test_where.py b/tests/Python/test_where.py new file mode 100644 index 000000000..5266f00b7 --- /dev/null +++ b/tests/Python/test_where.py @@ -0,0 +1,38 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp +from torch._functorch.aot_autograd import aot_autograd_decompositions + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import linalg + + +def foo(x, y, z): + return 
torch.where(x, y, z) + + +in1 = torch.ones([13, 13], dtype=torch.bool) +in2 = 0 +in3 = torch.ones([13, 13], dtype=torch.float32) +# Initialize the dynamo compiler. +dynamo_compiler = DynamoCompiler( + primary_registry=linalg.ops_registry, + aot_autograd_decomposition=aot_autograd_decompositions, +) + +graphs = dynamo_compiler.importer(foo, in1, in2, in3) +assert len(graphs) == 1 +graph = graphs[0] +graph.lower_to_top_level_ir() +print(graph._imported_module) + +# CHECK: module { +# CHECK-LABEL: func.func @forward +# CHECK: %{{.*}} = arith.constant +# CHECK: %{{.*}} = tensor.empty +# CHECK: %{{.*}} = linalg.generic +# CHECK: return %{{.*}} +# CHECK: } +# CHECK: } diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 575fe7eea..afa07bff8 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -20,6 +20,7 @@ target_link_libraries(buddy-opt LowerDIPPass BuddyDAP LowerDAPPass + DAPVectorization BuddyRVV LowerRVVPass MatMulOptimization @@ -34,4 +35,5 @@ target_link_libraries(buddy-opt LowerLinalgToGemminiPass SchedulingOnDevices LowerSche + MLIRGPUPasses ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index 4f359cb18..bf9652d71 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -56,6 +56,7 @@ void registerPoolingVectorizationPass(); void registerLowerBudPass(); void registerLowerDIPPass(); void registerLowerDAPPass(); +void registerDAPVectorizePass(); void registerLowerRVVPass(); void registerBatchMatMulOptimizePass(); void registerMatMulOptimizePass(); @@ -68,6 +69,8 @@ void registerLowerGemminiPass(); void registerLowerLinalgToGemminiPass(); void registerDeviceSchedulePass(); void registerLowerSchePass(); +void registerGPUHostRegisterPass(); +void registerBuddyGPUBufferizePass(); } // namespace buddy } // namespace mlir @@ -82,6 +85,8 @@ int main(int argc, char **argv) { mlir::buddy::registerLowerBudPass(); mlir::buddy::registerLowerDIPPass(); 
mlir::buddy::registerLowerDAPPass(); + // Register Vectorization of DAP Dialect. + mlir::buddy::registerDAPVectorizePass(); mlir::buddy::registerLowerRVVPass(); mlir::buddy::registerLowerVectorExpPass(); mlir::buddy::registerLowerGemminiPass(); @@ -95,9 +100,13 @@ int main(int argc, char **argv) { mlir::buddy::registerTransposeOptimizationPass(); mlir::buddy::registerConvOptimizePass(); mlir::buddy::registerDeviceSchedulePass(); - mlir::buddy::registerLowerSchePass();; + mlir::buddy::registerLowerSchePass(); - mlir::DialectRegistry registry; + // Register gpu passes + mlir::buddy::registerGPUHostRegisterPass(); + mlir::buddy::registerBuddyGPUBufferizePass(); + + mlir::DialectRegistry registry; // Register all MLIR core dialects. registerAllDialects(registry); mlir::registerAllExtensions(registry);