From 719e7ded47a476b5f716caeccbd077bac2834676 Mon Sep 17 00:00:00 2001 From: Ellis Shi Date: Tue, 31 Oct 2023 15:44:10 +0800 Subject: [PATCH] [midend][examples] Add MatMulParallelVectorization, BuiltinTransposeVectorization, BatchMatMulOptimize. (#215) * [midend] Add MatMulParallelVectorization and optimized BatchMatMulOptimize. * [examples] Fix makefile arguments and add MatMulParallelVectorization testcase. * [midend][examples] Add parallelize BuiltinTransposeVectorization and tests. * [midend] Restrict the transpose optimize for 2 rank tensor and remove unused header files. * [midend] Canonicalize dynamic rank detection. --- .../MLIRLinalg/linalg-batch-matmul-f32.mlir | 7 +- .../MLIRLinalg/linalg-batch-matmul-i8.mlir | 7 +- .../MLIRLinalg/linalg-matmul-opt-f32.mlir | 28 ++ examples/MLIRLinalg/linalg-matmul-opt-i8.mlir | 42 ++ examples/MLIRLinalg/linalg-transpose-f32.mlir | 27 + examples/MLIRLinalg/makefile | 136 +++++- midend/lib/Conversion/CMakeLists.txt | 1 + .../BatchMatMulOptimize.cpp | 429 +++++++++------- .../MatMulOptimization/CMakeLists.txt | 6 +- .../MatMulParallelVectorization.cpp | 369 ++++++++++++++ .../BuiltinTransposeVectorization.cpp | 460 ++++++++++++++++++ .../TransposeOptimization/CMakeLists.txt | 5 + tools/buddy-opt/CMakeLists.txt | 4 +- tools/buddy-opt/buddy-opt.cpp | 5 +- 14 files changed, 1340 insertions(+), 186 deletions(-) create mode 100644 examples/MLIRLinalg/linalg-matmul-opt-f32.mlir create mode 100644 examples/MLIRLinalg/linalg-matmul-opt-i8.mlir create mode 100644 examples/MLIRLinalg/linalg-transpose-f32.mlir create mode 100644 midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp create mode 100644 midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp create mode 100644 midend/lib/Conversion/TransposeOptimization/CMakeLists.txt diff --git a/examples/MLIRLinalg/linalg-batch-matmul-f32.mlir b/examples/MLIRLinalg/linalg-batch-matmul-f32.mlir index 1ab0fe00ea..251e100dc3 100644 --- a/examples/MLIRLinalg/linalg-batch-matmul-f32.mlir +++ b/examples/MLIRLinalg/linalg-batch-matmul-f32.mlir @@ -1,5 +1,5 @@ // RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \ -// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_f32 \ +// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_f32 -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ memref.global "private" @C : memref<2x2x4xf32> = dense<[[[ 49., 113., 146., 82. 
func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -func.func @buddy_batchmatmul_f32() -> f32{ +func.func @buddy_batchmatmul_f32(){ %a = memref.get_global @A : memref<2x2x3xf32> %b = memref.get_global @B : memref<2x3x4xf32> %c = memref.get_global @C : memref<2x2x4xf32> @@ -24,6 +24,5 @@ func.func @buddy_batchmatmul_f32() -> f32{ // CHECK{LITERAL}: [12, 76, 96, 56]], // CHECK{LITERAL}: [[48, 162, 72, 156], // CHECK{LITERAL}: [16, 112, 0, 104]]] - %zero = arith.constant 0.0 :f32 - return %zero :f32 + return } diff --git a/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir b/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir index 7b39258fcf..a6e5a51f61 100644 --- a/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir +++ b/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir @@ -1,5 +1,5 @@ // RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \ -// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_i8 \ +// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_i8 -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -9,7 +9,7 @@ memref.global "private" @C : memref<2x2x4xi8> = dense<[[[49, 12, 14, 82],[6, 38, func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -func.func @buddy_batchmatmul_i8() -> f32{ +func.func @buddy_batchmatmul_i8(){ %a = memref.get_global @A : memref<2x2x3xi8> %b = memref.get_global @B : memref<2x3x4xi8> %c = memref.get_global @C : memref<2x2x4xi8> @@ -41,6 +41,5 @@ func.func @buddy_batchmatmul_i8() -> f32{ // CHECK{LITERAL}: [12, 76, 96, 56]], // CHECK{LITERAL}: [[48, -94, 72, -100], // CHECK{LITERAL}: [16, 112, 0, 104]]] - %zero = arith.constant 0.0 :f32 - return %zero :f32 + return } diff --git a/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir b/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir new file mode 100644 index 0000000000..c224e1ac2c --- /dev/null +++ b/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir @@ -0,0 +1,28 @@ +// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e buddy_matmul_f32 -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +memref.global "private" @A : memref<4x3xf32> = dense<[[9., 4., 6.],[2., 4., 0.],[6., 3., 3.],[0., 4., 7.]]> +memref.global "private" @B : memref<3x4xf32> = dense<[[1., 3., 8., 0.],[1., 8., 8., 7.], [6., 9., 7., 9.]]> +memref.global "private" @C : memref<4x4xf32> = dense<[[49., 113., 146., 82.],[6., 38., 48., 28.],[24., 81., 36., 78.],[8., 56., 0., 52.]]> + +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +func.func @buddy_matmul_f32(){ + %a = memref.get_global @A : memref<4x3xf32> + %b = memref.get_global @B : memref<3x4xf32> + %c = memref.get_global @C : memref<4x4xf32> + + linalg.matmul + ins(%a, %b: memref<4x3xf32>, memref<3x4xf32>) + outs(%c: memref<4x4xf32>) + %printed_c = memref.cast %c : memref<4x4xf32> 
to memref<*xf32> + call @printMemrefF32(%printed_c) : (memref<*xf32>) -> () + // CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 4\] strides = \[4, 1\] data =}} + // CHECK{LITERAL}: [[98, 226, 292, 164], + // CHECK{LITERAL}: [12, 76, 96, 56], + // CHECK{LITERAL}: [51, 150, 129, 126], + // CHECK{LITERAL}: [54, 151, 81, 143]] + return +} diff --git a/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir b/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir new file mode 100644 index 0000000000..c5836ec39a --- /dev/null +++ b/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir @@ -0,0 +1,42 @@ +// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e buddy_matmul_i8 -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +memref.global "private" @A : memref<4x3xi8> = dense<[[9, 4, 6],[2, 4, 0],[6, 3, 3],[0, 4, 7]]> +memref.global "private" @B : memref<3x4xi8> = dense<[[1, 3, 8, 0],[1, 8, 8, 7], [6, 9, 7, 9]]> +memref.global "private" @C : memref<4x4xi8> = dense<[[49, 113, 46, 82],[6, 38, 48, 28],[24, 81, 36, 78],[8, 56, 0, 52]]> + +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +func.func @buddy_matmul_i8(){ + %a = memref.get_global @A : memref<4x3xi8> + %b = memref.get_global @B : memref<3x4xi8> + %c = memref.get_global @C : memref<4x4xi8> + + linalg.matmul + ins(%a, %b: memref<4x3xi8>, memref<3x4xi8>) + outs(%c: memref<4x4xi8>) + + %cst_0 = arith.constant 0 : index + %cst_1 = arith.constant 1 : index + %cst_4 = arith.constant 4 : index + + %c_f32 = memref.alloca() : memref<4x4xf32> + scf.for %i = %cst_0 to %cst_4 step %cst_1 { + scf.for %j = %cst_0 to %cst_4 step %cst_1 { + %val_i8 = memref.load %c[%i, %j] : memref<4x4xi8> + %val_f32 = arith.sitofp %val_i8 : i8 to f32 + memref.store %val_f32, %c_f32[%i, %j] : memref<4x4xf32> + } + } + + %printed_c = memref.cast %c_f32 : memref<4x4xf32> to memref<*xf32> + call @printMemrefF32(%printed_c) : (memref<*xf32>) -> () + // CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 4\] strides = \[4, 1\] data =}} + // CHECK{LITERAL}: [[98, -30, -64, -92], + // CHECK{LITERAL}: [12, 76, 96, 56], + // CHECK{LITERAL}: [51, -106, -127, 126], + // CHECK{LITERAL}: [54, -105, 81, -113]] + return +} diff --git a/examples/MLIRLinalg/linalg-transpose-f32.mlir b/examples/MLIRLinalg/linalg-transpose-f32.mlir new file mode 100644 index 0000000000..3b1ef59839 --- /dev/null +++ b/examples/MLIRLinalg/linalg-transpose-f32.mlir @@ -0,0 +1,27 @@ +// RUN: buddy-opt -transpose-optimize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c-wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e buddy_transpose_f32 -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +memref.global "private" @A : memref<3x4xf32> = dense<[[1., 3., 8., 0.],[1., 8., 
8., 7.], [6., 9., 7., 9.]]> + +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +func.func @buddy_transpose_f32(){ + %a = memref.get_global @A : memref<3x4xf32> + %b = memref.alloc() : memref<4x3xf32> + + linalg.transpose + ins(%a: memref<3x4xf32>) + outs(%b: memref<4x3xf32>) + permutation = [1, 0] + %printed_b = memref.cast %b : memref<4x3xf32> to memref<*xf32> + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () + memref.dealloc %b : memref<4x3xf32> + // CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 3\] strides = \[3, 1\] data =}} + // CHECK{LITERAL}: [[1, 1, 6], + // CHECK{LITERAL}: [3, 8, 9], + // CHECK{LITERAL}: [8, 8, 7], + // CHECK{LITERAL}: [0, 7, 9]] + return +} diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile index 5fba1b9b6b..f214fa7f67 100644 --- a/examples/MLIRLinalg/makefile +++ b/examples/MLIRLinalg/makefile @@ -148,7 +148,7 @@ linalg-batch-matmul-optimize-run: -convert-arith-to-llvm \ -convert-func-to-llvm \ -reconcile-unrealized-casts | \ - ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_f32 -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} linalg-batch-matmul-lower: @@ -170,7 +170,7 @@ linalg-batch-matmul-run: -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \ -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \ -convert-func-to-llvm -reconcile-unrealized-casts | \ - ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_f32 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} linalg-batch-matmul-optimize-lower: @${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \ @@ -203,7 +203,7 @@ linalg-batch-matmul-i8-optimize-run: -convert-arith-to-llvm \ -convert-func-to-llvm \ -reconcile-unrealized-casts | \ - ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} linalg-batch-matmul-i8-lower: @@ -225,7 +225,7 @@ linalg-batch-matmul-i8-run: -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \ -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \ -convert-func-to-llvm -reconcile-unrealized-casts | \ - ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} linalg-batch-matmul-i8-optimize-lower: @${BUDDY_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \ @@ -246,6 +246,134 @@ linalg-batch-matmul-i8-optimize-translate: -reconcile-unrealized-casts | \ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll +linalg-matmul-parallized-vectorized-optmize-run: + @${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -convert-linalg-to-loops \ + -expand-strided-metadata \ + -lower-affine \ + -convert-scf-to-cf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-arith-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} 
${OPT_FLAG} -e buddy_matmul_f32 -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +linalg-matmul-parallized-vectorized-optmize-lower: + @${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -o ./log.mlir + +linalg-matmul-parallized-vectorized-optmize-translate: + @${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -convert-linalg-to-loops \ + -expand-strided-metadata \ + -lower-affine \ + -convert-scf-to-cf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-arith-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +linalg-matmul-i8-parallized-vectorized-optmize-run: + @${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -convert-linalg-to-loops \ + -expand-strided-metadata \ + -lower-affine \ + -convert-scf-to-cf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-arith-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_matmul_i8 -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +linalg-matmul-i8-parallized-vectorized-optmize-lower: + @${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -o ./log.mlir + +linalg-matmul-i8-parallized-vectorized-optmize-translate: + @${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \ + -matmul-paralell-vectorization-optimize="vector-size=128" \ + -convert-linalg-to-loops \ + -expand-strided-metadata \ + -lower-affine \ + -convert-scf-to-cf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-arith-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +linalg-transpose-optimize-run: + @${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -transpose-optimize="vector-size=16" \ + -lower-affine \ + -convert-vector-to-scf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-arith-to-llvm \ + -llvm-request-c-wrappers \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_transpose_f32 -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +linalg-transpose-lower: + @${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \ + -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \ + -convert-func-to-llvm -reconcile-unrealized-casts \ + -o ./log.mlir + +linalg-transpose-translate: + @${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \ + -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \ + -convert-func-to-llvm -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + +linalg-transpose-run: + @${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \ + -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \ + -convert-func-to-llvm -reconcile-unrealized-casts | \ + 
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_transpose_f32 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +linalg-transpose-optimize-lower: + @${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -transpose-optimize="vector-size=16" \ + -o ./log.mlir + +linalg-transpose-optimize-translate: + @${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \ + -transpose-optimize="vector-size=16" \ + -lower-affine \ + -convert-vector-to-scf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-arith-to-llvm \ + -llvm-request-c-wrappers \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll + + linalg-conv2d_nchw_fchw-lower: @${MLIR_OPT} ./linalg-conv2d_nchw_fchw.mlir \ -convert-linalg-to-loops -o ./log.mlir diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index 4a1067a982..20c7b2b27c 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(LowerDIP) add_subdirectory(LowerRVV) add_subdirectory(LowerDAP) add_subdirectory(MatMulOptimization) +add_subdirectory(TransposeOptimization) add_subdirectory(ConvOptimization) add_subdirectory(LowerVectorExp) add_subdirectory(LowerGemmini) diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp index 334c6c4023..f86435abbd 100644 --- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp +++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp @@ -1,5 +1,4 @@ -//===- BatchMatMulOptimize.cpp -//-------------------------------------------------===// +//===- BatchMatMulOptimize.cpp --------------------------------------------===// // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,14 +17,14 @@ // This file implements the batchmatmul optimization. // //===----------------------------------------------------------------------===// - #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/TypeRange.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IntegerSet.h" #include "mlir/IR/ValueRange.h" #include "llvm/ADT/ArrayRef.h" #include @@ -41,6 +40,7 @@ using namespace mlir; using namespace vector; +using namespace affine; //===----------------------------------------------------------------------===// // Rewrite Pattern @@ -62,187 +62,274 @@ class BatchMatMulOptimizePattern : public ConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); - // Get input A, B, C. + // Retrieve input tensors A, B, and C. Value A = op->getOperand(0); Value B = op->getOperand(1); Value C = op->getOperand(2); - // Get ElementType of input and output. - auto A_elementType = A.getType().cast().getElementType(); - // Some constants. - const Value c0 = + // Acquire the element type of input tensors. + Type elementType = A.getType().cast().getElementType(); + + // Define constants. 
+ const Value zeroIndex = rewriter.create(loc, rewriter.getIndexAttr(0)); - const Value step = rewriter.create( - loc, rewriter.getIndexAttr(affineVectorSize)); const AffineExpr d0 = rewriter.getAffineDimExpr(0); const AffineExpr d1 = rewriter.getAffineDimExpr(1); const AffineExpr d2 = rewriter.getAffineDimExpr(2); - const AffineExpr c0_affine = rewriter.getAffineConstantExpr(0); - - const Value c0_dynamicType = rewriter.create( - loc, rewriter.getZeroAttr(A_elementType)); - const Value c0_dynamicType_vec = rewriter.create( - loc, VectorType::get({affineVectorSize}, A_elementType), c0_dynamicType); - - // Dims - Value BATCH = rewriter.create(loc, A, 0); // Batch size - Value M = rewriter.create(loc, A, 1); // A row - Value N = rewriter.create(loc, B, 2); // B col - Value K = rewriter.create(loc, B, 1); // B row - - auto reducedValues = llvm::to_vector<4>(llvm::map_range( - ArrayRef{}, - [](const mlir::affine::LoopReduction &red) { return red.value; })); - - // Build parallel loop body. - auto parallelLoop = rewriter.create( - loc, ValueRange(reducedValues).getTypes(), ValueRange{BATCH}, - ArrayRef{ - rewriter.getNamedAttr( - "lowerBoundsGroups", - rewriter.getI32TensorAttr(ArrayRef{1})), - rewriter.getNamedAttr( - "upperBoundsGroups", - rewriter.getI32TensorAttr(ArrayRef{1})), - rewriter.getNamedAttr("lowerBoundsMap", - AffineMapAttr::get(AffineMap::get( - 0, 0, {c0_affine}, - rewriter.getContext()))), - rewriter.getNamedAttr("upperBoundsMap", - AffineMapAttr::get(AffineMap::get( - 1, 0, {d0}, rewriter.getContext()))), - rewriter.getNamedAttr("reductions", rewriter.getArrayAttr({})), - rewriter.getNamedAttr("steps", rewriter.getI64ArrayAttr(1))}); - - auto body = new Block(); - rewriter.setInsertionPointToStart(body); - body->addArgument(rewriter.getIndexType(), loc); - - Value ivBatch = body->getArguments()[0]; + const AffineExpr s0 = rewriter.getAffineSymbolExpr(0); + const AffineExpr zeroAffine = rewriter.getAffineConstantExpr(0); + + const Value zeroElementType = rewriter.create( + loc, rewriter.getZeroAttr(elementType)); + const Value zeroElementTypeVec = rewriter.create( + loc, VectorType::get({affineVectorSize}, elementType), zeroElementType); + + // Get dimensions of input tensors. + Value batch = rewriter.create(loc, A, 0); + Value aRow = rewriter.create(loc, A, 1); + Value bCol = rewriter.create(loc, B, 2); + Value bRow = rewriter.create(loc, B, 1); + + // Calculate the length of the tail, which might not fit in a vector. + Value tailLength = rewriter.create( + loc, AffineMap::get(1, 0, d0 % affineVectorSize), ValueRange{bCol}); + + // Generate a mask vector based on the tail length. + Value maskVector = rewriter.create( + loc, VectorType::get({affineVectorSize}, rewriter.getI1Type()), + ValueRange{tailLength}); + + SmallVector reducedValues = llvm::to_vector<4>( + llvm::map_range(ArrayRef{}, + [](const LoopReduction &red) { return red.value; })); + + // Apply the column of matrix B. + Value appliedColOfB = rewriter.create( + loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)), + ValueRange{bCol}); + + // Create the primary parallel batch level loop. 
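+    // Note: the affine.parallel op is assembled manually here. Its bound
+    // maps, bound groups, reductions, and steps are supplied as attributes,
+    // and its body block is created below and attached once it has been
+    // populated. Each parallel iteration owns a single batch index, so the
+    // iterations write disjoint slices of C.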
+ AffineParallelOp parallelBatchLoop = + rewriter.create( + loc, ValueRange(reducedValues).getTypes(), ValueRange{batch}, + ArrayRef{ + rewriter.getNamedAttr("lowerBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr("upperBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr( + "lowerBoundsMap", + AffineMapAttr::get(AffineMap::get(0, 0, {zeroAffine}, + rewriter.getContext()))), + rewriter.getNamedAttr("upperBoundsMap", + AffineMapAttr::get(AffineMap::get( + 1, 0, {d0}, rewriter.getContext()))), + rewriter.getNamedAttr("reductions", rewriter.getArrayAttr({})), + rewriter.getNamedAttr("steps", rewriter.getI64ArrayAttr({1}))}); + // Create the loop body for the parallel loop. + Block *loopBody = new Block(); + rewriter.setInsertionPointToStart(loopBody); + loopBody->addArgument(rewriter.getIndexType(), loc); + Value loopVarBatchIdx = loopBody->getArguments()[0]; + + // Prefetching data from tensor 'A' for better cache utilization. rewriter.create( loc, A, AffineMap::get(3, 0, {d0, d1, d2}, rewriter.getContext()), - ArrayRef{ivBatch, M, K}, false, 3, true); + ArrayRef{loopVarBatchIdx, aRow, bRow}, false, 3, true); + affine::buildAffineLoopNest( - rewriter, loc, {c0}, {K}, 1, + rewriter, loc, {zeroIndex}, {appliedColOfB}, 1, [&](OpBuilder &builder, Location loc, ValueRange ivRange) { - Value ivB_row = ivRange.front(); - affine::buildAffineLoopNest( - builder, loc, {c0}, {M}, 1, - [&](OpBuilder &builder, Location loc, ValueRange ivRange) { - Value ivA_row = ivRange.front(); - Value applied_n = builder.create( - loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)), - ValueRange{N}); - affine::buildAffineLoopNest( - builder, loc, {c0}, {applied_n}, 1, - [&](OpBuilder &builder, Location loc, ValueRange ivRange) { - Value ivB_col = ivRange.front(); - Value a_ele = builder.create( - loc, A, ValueRange{ivBatch, ivA_row, ivB_row}); - Value a_vec = builder.create( - loc, - VectorType::get({affineVectorSize}, A_elementType), - a_ele); - Value b_col_cur = - builder.create(loc, ivB_col, step); - Value tail_len = - builder.create(loc, N, b_col_cur); - Value tail_flag = builder.create( - loc, mlir::arith::CmpIPredicate::sge, tail_len, step); - builder.create( - loc, tail_flag, - [&](OpBuilder &builder, Location loc) { - Value b_vec = - builder.create( - loc, - VectorType::get({affineVectorSize}, - A_elementType), - B, - AffineMap::get( - 3, 0, {d0, d1, d2 * affineVectorSize}, - rewriter.getContext()), - ValueRange{ivBatch, ivB_row, ivB_col}); - Value c_vec = - builder.create( - loc, - VectorType::get({affineVectorSize}, - A_elementType), - C, - AffineMap::get( - 3, 0, {d0, d1, d2 * affineVectorSize}, - rewriter.getContext()), - ValueRange{ivBatch, ivA_row, ivB_col}); - Value result_vec; - if (A_elementType.isa()) { - Value add_vec = builder.create( - loc, a_vec, b_vec); - result_vec = builder.create( - loc, add_vec, c_vec); - } else { - result_vec = builder.create( - loc, a_vec, b_vec, c_vec); - } - builder.create( - loc, result_vec, C, - AffineMap::get(3, 0, - {d0, d1, d2 * affineVectorSize}, - rewriter.getContext()), - ValueRange{ivBatch, ivA_row, ivB_col}); - builder.create(loc); - }, - [&](OpBuilder &builder, Location loc) { - Value mask_vec = - builder.create( - loc, - VectorType::get({affineVectorSize}, - rewriter.getI1Type()), - ValueRange{tail_len}); - Value b_col_idx_tail = - builder.create(loc, ivB_col, - step); - Value b_vec_tail = - builder.create( - loc, - VectorType::get({affineVectorSize}, - A_elementType), - B, - ValueRange{ivBatch, ivB_row, - 
b_col_idx_tail}, - mask_vec, c0_dynamicType_vec); - Value c_vec_tail = - builder.create( - loc, - VectorType::get({affineVectorSize}, - A_elementType), - C, - ValueRange{ivBatch, ivA_row, - b_col_idx_tail}, - mask_vec, c0_dynamicType_vec); - Value result_vec_tail; - if (A_elementType.isa()) { - Value add_vec = builder.create( - loc, a_vec, b_vec_tail); - result_vec_tail = builder.create( - loc, add_vec, c_vec_tail); - } else { - result_vec_tail = builder.create( - loc, a_vec, b_vec_tail, c_vec_tail); - } - builder.create( - loc, C, - ValueRange{ivBatch, ivA_row, b_col_idx_tail}, - mask_vec, result_vec_tail); - builder.create(loc); - }); - }); - }); + Value loopVarColOfB = ivRange.front(); + + // Compile time branch detection. + if (C.getType().cast().isDynamicDim(2) or + C.getType().cast().getDimSize(2) % affineVectorSize != + 0) { + + // Depending on the position, use either full vectors or tail + // vectors. + affine::AffineIfOp branchingOp = builder.create( + loc, + IntegerSet::get( + 1, 1, {d0 * -affineVectorSize + s0 - affineVectorSize}, + {false}), + ValueRange{loopVarBatchIdx, bCol}, true); + + // Branch handling full vector operations. + OpBuilder trueBranchBuilder = branchingOp.getThenBodyBuilder(); + affine::buildAffineLoopNest( + trueBranchBuilder, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + AffineMap::get(3, 0, {d0, d1, d2 * affineVectorSize}, + rewriter.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfB, + loopVarColOfB}); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, + ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarRowOfB}); + Value aVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), C, + AffineMap::get(3, 0, + {d0, d1, d2 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarColOfB}); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add + // based on the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = builder.create( + loc, aVec, bVec, cVec); + } + builder.create( + loc, computedVec, C, + AffineMap::get(3, 0, + {d0, d1, d2 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarColOfB}); + }); + }); + + // Branch handling operations on the tail. 
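+          // The else branch covers the last column block when the column
+          // count of C is not a multiple of the vector size: the block index
+          // is rescaled to an element index, and B and C are accessed with
+          // vector.maskedload / vector.maskedstore using the mask built from
+          // the tail length, with the zero splat vector as padding.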
+ OpBuilder falseBranchBuilder = branchingOp.getElseBodyBuilder(); + affine::buildAffineLoopNest( + falseBranchBuilder, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value tailIdxColOfB = builder.create( + loc, AffineMap::get(1, 0, d0 * affineVectorSize), + ValueRange{loopVarColOfB}); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + ValueRange{loopVarBatchIdx, loopVarRowOfB, tailIdxColOfB}, + maskVector, zeroElementTypeVec); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, + ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarRowOfB}); + Value aVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), C, + ValueRange{loopVarBatchIdx, loopVarRowOfA, + tailIdxColOfB}, + maskVector, zeroElementTypeVec); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add + // based on the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = builder.create( + loc, aVec, bVec, cVec); + } + builder.create( + loc, C, + ValueRange{loopVarBatchIdx, loopVarRowOfA, + tailIdxColOfB}, + maskVector, computedVec); + }); + }); + } else { + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + AffineMap::get(3, 0, {d0, d1, d2 * affineVectorSize}, + rewriter.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfB, + loopVarColOfB}); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, + ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarRowOfB}); + Value aVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, + VectorType::get({affineVectorSize}, elementType), C, + AffineMap::get(3, 0, + {d0, d1, d2 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarColOfB}); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add + // based on the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = builder.create( + loc, aVec, bVec, cVec); + } + builder.create( + loc, computedVec, C, + AffineMap::get(3, 0, + {d0, d1, d2 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarBatchIdx, loopVarRowOfA, + loopVarColOfB}); + }); + }); + } }); rewriter.create(loc); - parallelLoop.getRegion().push_back(body); - rewriter.setInsertionPointAfter(parallelLoop); + // Finalize the loop and erase the original operation. 
+ parallelBatchLoop.getRegion().push_back(loopBody); + rewriter.setInsertionPointAfter(parallelBatchLoop); rewriter.eraseOp(op); return success(); @@ -279,9 +366,9 @@ class BatchMatMulOptimizePass affine::AffineDialect, VectorDialect>(); } - Option affineVectorSize{ - *this, "vector-size", - llvm::cl::desc("Affine Vector size."), llvm::cl::init(64)}; + Option affineVectorSize{*this, "vector-size", + llvm::cl::desc("Affine Vector size."), + llvm::cl::init(64)}; }; } // end anonymous namespace. diff --git a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt index 4f28fa8a44..8e726863eb 100644 --- a/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt +++ b/midend/lib/Conversion/MatMulOptimization/CMakeLists.txt @@ -2,7 +2,7 @@ add_mlir_library(MatMulOptimization BatchMatMulOptimize.cpp MatMulOptimize.cpp MatMulVectorization.cpp - + MatMulParallelVectorization.cpp LINK_LIBS PUBLIC BuddyUtils ) @@ -10,3 +10,7 @@ add_mlir_library(MatMulOptimization add_mlir_library(BatchMatMulOptimization BatchMatMulOptimize.cpp ) + +add_mlir_library(MatMulParallelVectorization + MatMulParallelVectorization.cpp +) diff --git a/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp b/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp new file mode 100644 index 0000000000..90cbae7d21 --- /dev/null +++ b/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp @@ -0,0 +1,369 @@ +//===- MatMulParallelVectorization.cpp ------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the matmul-paralell-vectorization optimization. 
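+//
+// The pattern rewrites linalg.matmul into an affine.parallel loop over
+// vector-size-wide column blocks of the output. Within each block, elements
+// of A are broadcast and combined with vector loads of B and C using FMA
+// (or integer multiply and add), and a masked path, selected with affine.if,
+// handles the last block when the column count is dynamic or not a multiple
+// of the vector size.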
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/ValueRange.h" +#include "llvm/ADT/ArrayRef.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace vector; +using namespace affine; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class MatMulParallelVectorizationPattern : public ConversionPattern { +public: + explicit MatMulParallelVectorizationPattern(MLIRContext *context, + int64_t affineVectorSizeParam) + : ConversionPattern(linalg::MatmulOp::getOperationName(), 1, context) { + affineVectorSize = affineVectorSizeParam; + } + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef /*operands*/, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + + // Retrieve input tensors A, B, and C. + Value A = op->getOperand(0); + Value B = op->getOperand(1); + Value C = op->getOperand(2); + + // Acquire the element type of input tensors. + Type elementType = A.getType().cast().getElementType(); + + // Define constants. + const Value zeroIndex = + rewriter.create(loc, rewriter.getIndexAttr(0)); + const AffineExpr d0 = rewriter.getAffineDimExpr(0); + const AffineExpr d1 = rewriter.getAffineDimExpr(1); + const AffineExpr s0 = rewriter.getAffineSymbolExpr(0); + const AffineExpr zeroAffine = rewriter.getAffineConstantExpr(0); + + const Value zeroElementType = rewriter.create( + loc, rewriter.getZeroAttr(elementType)); + const Value zeroElementTypeVec = rewriter.create( + loc, VectorType::get({affineVectorSize}, elementType), zeroElementType); + + // Get dimensions of input tensors. + Value aRow = rewriter.create(loc, A, 0); + Value bCol = rewriter.create(loc, B, 1); + Value bRow = rewriter.create(loc, B, 0); + + // Calculate the length of the tail, which might not fit in a vector. + Value tailLength = rewriter.create( + loc, AffineMap::get(1, 0, d0 % affineVectorSize), ValueRange{bCol}); + + // Generate a mask vector based on the tail length. + Value maskVector = rewriter.create( + loc, VectorType::get({affineVectorSize}, rewriter.getI1Type()), + ValueRange{tailLength}); + + SmallVector reducedValues = llvm::to_vector<4>( + llvm::map_range(ArrayRef{}, + [](const LoopReduction &red) { return red.value; })); + + // Apply the column of matrix B. + Value appliedColOfB = rewriter.create( + loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)), + ValueRange{bCol}); + + // Create the primary parallel loop for matrix multiplication. 
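+    // The parallel dimension is the number of vector-size column blocks of B
+    // (appliedColOfB = ceil(bCol / vector size)), so each affine.parallel
+    // iteration produces one vector-wide strip of C and no two iterations
+    // write the same elements.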
+ AffineParallelOp parallelLoop = rewriter.create( + loc, ValueRange(reducedValues).getTypes(), ValueRange{appliedColOfB}, + ArrayRef{ + rewriter.getNamedAttr("lowerBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr("upperBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr( + "lowerBoundsMap", + AffineMapAttr::get( + AffineMap::get(0, 0, {zeroAffine}, rewriter.getContext()))), + rewriter.getNamedAttr("upperBoundsMap", + AffineMapAttr::get(AffineMap::get( + 1, 0, {d0}, rewriter.getContext()))), + rewriter.getNamedAttr("reductions", rewriter.getArrayAttr({})), + rewriter.getNamedAttr("steps", rewriter.getI64ArrayAttr({1}))}); + + // Create the loop body for the parallel loop. + Block *loopBody = new Block(); + rewriter.setInsertionPointToStart(loopBody); + loopBody->addArgument(rewriter.getIndexType(), loc); + Value loopVarColOfB = loopBody->getArguments()[0]; + + // Prefetching data from tensor 'A' for better cache utilization. + rewriter.create( + loc, A, AffineMap::get(2, 0, {d0, d1}, rewriter.getContext()), + ArrayRef{aRow, bRow}, false, 3, true); + + // Compile time branch detection. + if (C.getType().cast().isDynamicDim(1) or + C.getType().cast().getDimSize(1) % affineVectorSize != 0) { + + // Depending on the position, use either full vectors or tail vectors. + affine::AffineIfOp branchingOp = rewriter.create( + loc, + IntegerSet::get( + 1, 1, {d0 * -affineVectorSize + s0 - affineVectorSize}, {false}), + ValueRange{loopVarColOfB, bCol}, true); + + // Branch handling full vector operations. + OpBuilder trueBranchBuilder = branchingOp.getThenBodyBuilder(); + affine::buildAffineLoopNest( + trueBranchBuilder, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + rewriter.getContext()), + ValueRange{loopVarRowOfB, loopVarColOfB}); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, ValueRange{loopVarRowOfA, loopVarRowOfB}); + Value aVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), C, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarRowOfA, loopVarColOfB}); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add + // based on the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = + builder.create(loc, aVec, bVec, cVec); + } + builder.create( + loc, computedVec, C, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarRowOfA, loopVarColOfB}); + }); + }); + + // Branch handling operations on the tail. 
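+      // As in the aligned branch, but the last column block is accessed with
+      // vector.maskedload / vector.maskedstore, so lanes beyond the column
+      // count of C are suppressed and the zero splat vector supplies the
+      // padding values.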
+ OpBuilder falseBranchBuilder = branchingOp.getElseBodyBuilder(); + affine::buildAffineLoopNest( + falseBranchBuilder, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value tailIdxColOfB = builder.create( + loc, AffineMap::get(1, 0, d0 * affineVectorSize), + ValueRange{loopVarColOfB}); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + ValueRange{loopVarRowOfB, tailIdxColOfB}, maskVector, + zeroElementTypeVec); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, ValueRange{loopVarRowOfA, loopVarRowOfB}); + Value aVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), C, + ValueRange{loopVarRowOfA, tailIdxColOfB}, maskVector, + zeroElementTypeVec); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add based on + // the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = + builder.create(loc, aVec, bVec, cVec); + } + builder.create( + loc, C, ValueRange{loopVarRowOfA, tailIdxColOfB}, + maskVector, computedVec); + }); + }); + } else { + affine::buildAffineLoopNest( + rewriter, loc, {zeroIndex}, {bRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfB = ivRange.front(); + Value bVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), B, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + rewriter.getContext()), + ValueRange{loopVarRowOfB, loopVarColOfB}); + affine::buildAffineLoopNest( + builder, loc, {zeroIndex}, {aRow}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value loopVarRowOfA = ivRange.front(); + Value aElement = builder.create( + loc, A, ValueRange{loopVarRowOfA, loopVarRowOfB}); + Value aVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), + aElement); + Value cVec = builder.create( + loc, VectorType::get({affineVectorSize}, elementType), C, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarRowOfA, loopVarColOfB}); + Value computedVec; + + // Compute the result vector either through integer + // multiplication and addition or fused multiply-add + // based on the element type. + if (elementType.isa()) { + Value mulVec = + builder.create(loc, aVec, bVec); + computedVec = + builder.create(loc, mulVec, cVec); + } else { + computedVec = + builder.create(loc, aVec, bVec, cVec); + } + builder.create( + loc, computedVec, C, + AffineMap::get(2, 0, {d0, d1 * affineVectorSize}, + builder.getContext()), + ValueRange{loopVarRowOfA, loopVarColOfB}); + }); + }); + } + + rewriter.create(loc); + + // Finalize the loop and erase the original operation. 
+ parallelLoop.getRegion().push_back(loopBody); + rewriter.setInsertionPointAfter(parallelLoop); + + rewriter.eraseOp(op); + return success(); + } + +private: + int64_t affineVectorSize; +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// MatMulParallelVectorizationPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering linalg pooling operations to mixture of +/// Affine + Vector operations. +namespace { +class MatMulParallelVectorizationPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MatMulParallelVectorizationPass) + StringRef getArgument() const final { + return "matmul-paralell-vectorization-optimize"; + } + StringRef getDescription() const final { + return "MatMulParallelVectorization Optimization."; + } + MatMulParallelVectorizationPass() = default; + MatMulParallelVectorizationPass(const MatMulParallelVectorizationPass &) {} + explicit MatMulParallelVectorizationPass(int64_t affineVectorSizeParam) { + affineVectorSize = affineVectorSizeParam; + } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + Option affineVectorSize{*this, "vector-size", + llvm::cl::desc("Affine Vector size."), + llvm::cl::init(64)}; +}; +} // end anonymous namespace. + +void MatMulParallelVectorizationPass::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + ConversionTarget target(*context); + target + .addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + + RewritePatternSet patterns(context); + patterns.add(context, affineVectorSize); + + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +namespace mlir { +namespace buddy { +void registerMatMulParallelVectorizationPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp b/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp new file mode 100644 index 0000000000..0cb8761a72 --- /dev/null +++ b/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp @@ -0,0 +1,460 @@ +//===- BuiltinTransposeVectorization.cpp ----------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the transpose optimization. 
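+//
+// The pattern rewrites a rank-2 linalg.transpose into an affine.parallel
+// loop over vector-size column blocks, with an inner affine loop stepping
+// over row blocks. Each block is read as a vector-size x vector-size tile
+// with vector.transfer_read and written back transposed with
+// vector.transfer_write; masked transfers cover the row and column tails
+// when the dimensions are not multiples of the vector size.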
+// +//===----------------------------------------------------------------------===// +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/ArrayRef.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace vector; +using namespace affine; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class TransposeOptimizationPattern : public ConversionPattern { +public: + explicit TransposeOptimizationPattern(MLIRContext *context, + int64_t affineVectorSizeParam) + : ConversionPattern(linalg::TransposeOp::getOperationName(), 1, context) { + affineVectorSize = affineVectorSizeParam; + } + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef /*operands*/, + ConversionPatternRewriter &rewriter) const override { + auto permutationArrayAttr = + op->getAttr(rewriter.getStringAttr("permutation")) + .cast() + .asArrayRef(); + + // Retrieve input tensors A, B. + Value A = op->getOperand(0); + Value B = op->getOperand(1); + + // Only to rewrite the rank 2 tensor transpose. + if (permutationArrayAttr[0] != 1 or permutationArrayAttr[1] != 0 or + A.getType().cast().getRank() != 2) { + return failure(); + } + + auto loc = op->getLoc(); + + // Acquire the element type of input tensors. + Type elementType = A.getType().cast().getElementType(); + + // Define constants. + const Value index0 = + rewriter.create(loc, rewriter.getIndexAttr(0)); + const Value indexVecSize = rewriter.create( + loc, rewriter.getIndexAttr(affineVectorSize)); + const AffineExpr d0 = rewriter.getAffineDimExpr(0); + const AffineExpr d1 = rewriter.getAffineDimExpr(1); + const AffineExpr s0 = rewriter.getAffineSymbolExpr(0); + const AffineExpr zeroAffine = rewriter.getAffineConstantExpr(0); + + const Value zeroElementType = rewriter.create( + loc, rewriter.getZeroAttr(elementType)); + + // Get dimensions of input tensor. + Value Row = rewriter.create(loc, A, 0); + Value Col = rewriter.create(loc, A, 1); + + // Calculate the length of the tail, which might not fit in a vector. + Value rowUnalignedLength = rewriter.create( + loc, AffineMap::get(1, 0, d0 % affineVectorSize), ValueRange{Row}); + Value colUnalignedLength = rewriter.create( + loc, AffineMap::get(1, 0, d0 % affineVectorSize), ValueRange{Col}); + Value rowUpperBound = rewriter.create( + loc, + AffineMap::get(1, 0, d0.floorDiv(affineVectorSize) * affineVectorSize), + ValueRange{Row}); + + // Generate a mask vector based on the tail length. 
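+    // Four 2-D masks of shape vector-size x vector-size are prepared with
+    // vector.create_mask: one load/store pair for the row tail and one for
+    // the column tail. The load and store masks swap their extents because
+    // each tile is written back transposed.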
+ Value rowEndMaskLoad = rewriter.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + rewriter.getI1Type()), + ValueRange{rowUnalignedLength, indexVecSize}); + Value colEndMaskLoad = rewriter.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + rewriter.getI1Type()), + ValueRange{indexVecSize, colUnalignedLength}); + Value rowEndMaskStore = rewriter.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + rewriter.getI1Type()), + ValueRange{indexVecSize, rowUnalignedLength}); + Value colEndMaskStore = rewriter.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + rewriter.getI1Type()), + ValueRange{colUnalignedLength, indexVecSize}); + + SmallVector reducedValues = llvm::to_vector<4>( + llvm::map_range(ArrayRef{}, + [](const LoopReduction &red) { return red.value; })); + + // Create the primary parallel loop. + AffineParallelOp parallelColLoop = + rewriter.create( + loc, ValueRange(reducedValues).getTypes(), ValueRange{Col}, + ArrayRef{ + rewriter.getNamedAttr("lowerBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr("upperBoundsGroups", + rewriter.getI32TensorAttr({1})), + rewriter.getNamedAttr( + "lowerBoundsMap", + AffineMapAttr::get(AffineMap::get(0, 0, {zeroAffine}, + rewriter.getContext()))), + rewriter.getNamedAttr( + "upperBoundsMap", + AffineMapAttr::get(AffineMap::get( + 0, 1, + {s0.floorDiv(affineVectorSize) * affineVectorSize}, + rewriter.getContext()))), + rewriter.getNamedAttr("reductions", rewriter.getArrayAttr({})), + rewriter.getNamedAttr( + "steps", rewriter.getI64ArrayAttr({affineVectorSize}))}); + + // Create the loop body for the parallel loop. + Block *loopBody = new Block(); + rewriter.setInsertionPointToStart(loopBody); + loopBody->addArgument(rewriter.getIndexType(), loc); + Value colIdx = loopBody->getArguments()[0]; + + affine::buildAffineLoopNest( + rewriter, loc, {index0}, {rowUpperBound}, affineVectorSize, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value rowIdx = ivRange.front(); + + auto tiledMatrix = rewriter.create( + loc, + TypeRange{VectorType::get( + ArrayRef{affineVectorSize, affineVectorSize}, + elementType)}, + ValueRange{A, rowIdx, colIdx, zeroElementType}, + ArrayRef{ + rewriter.getNamedAttr( + "in_bounds", + rewriter.getBoolArrayAttr(ArrayRef{false, true})), + rewriter.getNamedAttr( + "operand_segment_sizes", + rewriter.getDenseI32ArrayAttr(ArrayRef{1, 2, 1, 0})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d0, d1}, rewriter.getContext()))), + }); + + rewriter.create( + loc, TypeRange{}, ValueRange{tiledMatrix, B, colIdx, rowIdx}, + ArrayRef{ + rewriter.getNamedAttr( + "in_bounds", + rewriter.getBoolArrayAttr(ArrayRef{true, true})), + rewriter.getNamedAttr( + "operand_segment_sizes", + rewriter.getDenseI32ArrayAttr(ArrayRef{1, 1, 2, 0})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get(2, 0, {d1, d0}, + builder.getContext()))), + }); + + // Compile time branch detection. + if (A.getType().cast().isDynamicDim(0) or + A.getType().cast().getDimSize(0) % affineVectorSize != + 0) { + // Depending on the position, use either full vectors or tail + // vectors. + affine::AffineIfOp branchingRowUnaligned = + builder.create( + loc, + IntegerSet::get(1, 0, {d0 % affineVectorSize - 1}, {false}), + ValueRange{Row}, false); + + // Branch handling unaligned rows. 
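+            // When the row count is not a multiple of the vector size, the
+            // partial tile starting at rowUpperBound is loaded with a masked
+            // vector.transfer_read and written back transposed with a masked
+            // vector.transfer_write, using the row-end masks created above.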
+ OpBuilder trueRowUnalignedBranchBuilder = + branchingRowUnaligned.getThenBodyBuilder(); + + auto rowUnalignedTiledMatrix = + trueRowUnalignedBranchBuilder.create( + loc, + TypeRange{VectorType::get( + ArrayRef{affineVectorSize, affineVectorSize}, + elementType)}, + ValueRange{A, rowUpperBound, colIdx, zeroElementType, + rowEndMaskLoad}, + ArrayRef{ + rewriter.getNamedAttr("in_bounds", + rewriter.getBoolArrayAttr( + ArrayRef{false, true})), + rewriter.getNamedAttr("operand_segment_sizes", + rewriter.getDenseI32ArrayAttr( + ArrayRef{1, 2, 1, 1})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d0, d1}, rewriter.getContext()))), + }); + trueRowUnalignedBranchBuilder.create( + loc, TypeRange{}, + ValueRange{rowUnalignedTiledMatrix, B, colIdx, rowUpperBound, + rowEndMaskStore}, + ArrayRef{ + rewriter.getNamedAttr( + "in_bounds", + rewriter.getBoolArrayAttr(ArrayRef{true, false})), + rewriter.getNamedAttr("operand_segment_sizes", + rewriter.getDenseI32ArrayAttr( + ArrayRef{1, 1, 2, 1})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d1, d0}, builder.getContext()))), + }); + } + }); + + rewriter.create(loc); + + // Finalize the loop. + parallelColLoop.getRegion().push_back(loopBody); + rewriter.setInsertionPointAfter(parallelColLoop); + + if (A.getType().cast().isDynamicDim(1) or + A.getType().cast().getDimSize(1) % affineVectorSize != 0) { + + affine::AffineIfOp branchingColUnaligned = + rewriter.create( + loc, IntegerSet::get(1, 0, {d0 % affineVectorSize - 1}, {false}), + ValueRange{Col}, false); + + OpBuilder trueColUnalignedBranchBuilder = + branchingColUnaligned.getThenBodyBuilder(); + Value colUpperBound = + trueColUnalignedBranchBuilder.create( + loc, + AffineMap::get(1, 0, + d0.floorDiv(affineVectorSize) * affineVectorSize), + ValueRange{Col}); + + affine::buildAffineLoopNest( + trueColUnalignedBranchBuilder, loc, {index0}, {rowUpperBound}, + affineVectorSize, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value rowIdx = ivRange.front(); + auto colUnalignedTiledMatrix = + builder.create( + loc, + TypeRange{VectorType::get( + ArrayRef{affineVectorSize, affineVectorSize}, + elementType)}, + ValueRange{A, rowIdx, colUpperBound, zeroElementType, + colEndMaskLoad}, + ArrayRef{ + builder.getNamedAttr("in_bounds", + builder.getBoolArrayAttr( + ArrayRef{false, false})), + builder.getNamedAttr("operand_segment_sizes", + builder.getDenseI32ArrayAttr( + ArrayRef{1, 2, 1, 1})), + builder.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d0, d1}, builder.getContext()))), + }); + builder.create( + loc, TypeRange{}, + ValueRange{colUnalignedTiledMatrix, B, colUpperBound, rowIdx, + colEndMaskStore}, + ArrayRef{ + rewriter.getNamedAttr( + "in_bounds", + rewriter.getBoolArrayAttr(ArrayRef{false, true})), + rewriter.getNamedAttr("operand_segment_sizes", + rewriter.getDenseI32ArrayAttr( + ArrayRef{1, 1, 2, 1})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d1, d0}, builder.getContext()))), + }); + }); + + if (A.getType().cast().isDynamicDim(0) or + A.getType().cast().getDimSize(0) % affineVectorSize != + 0) { + affine::AffineIfOp branchingRowColUnaligned = + trueColUnalignedBranchBuilder.create( + loc, + IntegerSet::get(1, 0, {d0 % affineVectorSize - 1}, {false}), + ValueRange{Col}, false); + + OpBuilder trueRowColUnalignedBranchBuilder = + branchingRowColUnaligned.getThenBodyBuilder(); + Value rowColEndMaskLoad = + 
trueRowColUnalignedBranchBuilder.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + trueRowColUnalignedBranchBuilder.getI1Type()), + ValueRange{rowUnalignedLength, colUnalignedLength}); + Value rowColEndMaskStore = + trueRowColUnalignedBranchBuilder.create( + loc, + VectorType::get({affineVectorSize, affineVectorSize}, + trueRowColUnalignedBranchBuilder.getI1Type()), + ValueRange{colUnalignedLength, rowUnalignedLength}); + auto rowColUnalignedTiledMatrix = + trueRowColUnalignedBranchBuilder.create( + loc, + TypeRange{VectorType::get( + ArrayRef{affineVectorSize, affineVectorSize}, + elementType)}, + ValueRange{A, rowUpperBound, colUpperBound, zeroElementType, + rowColEndMaskLoad}, + ArrayRef{ + trueRowColUnalignedBranchBuilder.getNamedAttr( + "in_bounds", + trueRowColUnalignedBranchBuilder.getBoolArrayAttr( + ArrayRef{false, false})), + trueRowColUnalignedBranchBuilder.getNamedAttr( + "operand_segment_sizes", + trueRowColUnalignedBranchBuilder.getDenseI32ArrayAttr( + ArrayRef{1, 2, 1, 1})), + trueRowColUnalignedBranchBuilder.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d0, d1}, + trueRowColUnalignedBranchBuilder.getContext()))), + }); + trueRowColUnalignedBranchBuilder.create( + loc, TypeRange{}, + ValueRange{rowColUnalignedTiledMatrix, B, colUpperBound, + rowUpperBound, rowColEndMaskStore}, + ArrayRef{ + rewriter.getNamedAttr( + "in_bounds", + rewriter.getBoolArrayAttr(ArrayRef{false, false})), + rewriter.getNamedAttr( + "operand_segment_sizes", + rewriter.getDenseI32ArrayAttr(ArrayRef{1, 1, 2, 1})), + rewriter.getNamedAttr( + "permutation_map", + AffineMapAttr::get(AffineMap::get( + 2, 0, {d1, d0}, + trueRowColUnalignedBranchBuilder.getContext()))), + }); + } + } + + // Erase the original operation. + rewriter.eraseOp(op); + return success(); + } + +private: + int64_t affineVectorSize; +}; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// TransposeOptimizationPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering linalg pooling operations to mixture of +/// Affine + Vector operations. +namespace { +class TransposeOptimizationPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TransposeOptimizationPass) + StringRef getArgument() const final { return "transpose-optimize"; } + StringRef getDescription() const final { + return "Transpose Optimization only for rank 2 tensor."; + } + TransposeOptimizationPass() = default; + TransposeOptimizationPass(const TransposeOptimizationPass &) {} + explicit TransposeOptimizationPass(int64_t affineVectorSizeParam) { + affineVectorSize = affineVectorSizeParam; + } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry + .insert(); + } + + Option affineVectorSize{*this, "vector-size", + llvm::cl::desc("Affine Vector size."), + llvm::cl::init(16)}; +}; +} // end anonymous namespace. 
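+
+// The pass is exposed to buddy-opt as "transpose-optimize" with a
+// "vector-size" option (default 16); the example in
+// examples/MLIRLinalg/linalg-transpose-f32.mlir exercises it with
+// -transpose-optimize="vector-size=16".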
+ +void TransposeOptimizationPass::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + ConversionTarget target(*context); + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + + RewritePatternSet patterns(context); + patterns.add(context, affineVectorSize); + + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +namespace mlir { +namespace buddy { +void registerTransposeOptimizationPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/Conversion/TransposeOptimization/CMakeLists.txt b/midend/lib/Conversion/TransposeOptimization/CMakeLists.txt new file mode 100644 index 0000000000..70d5ca7fca --- /dev/null +++ b/midend/lib/Conversion/TransposeOptimization/CMakeLists.txt @@ -0,0 +1,5 @@ +add_mlir_library(TransposeOptimization + BuiltinTransposeVectorization.cpp + LINK_LIBS PUBLIC + BuddyUtils +) diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 333ee42ead..27b628720d 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -21,10 +21,12 @@ target_link_libraries(buddy-opt LowerRVVPass MatMulOptimization BatchMatMulOptimization + MatMulParallelVectorization + TransposeOptimization ConvOptimization VectorExp LowerVectorExpPass BuddyGemmini LowerGemminiPass LowerLinalgToGemminiPass - ) +) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index c906af8ff3..af5c38f545 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -57,7 +57,8 @@ void registerLowerRVVPass(); void registerBatchMatMulOptimizePass(); void registerMatMulOptimizePass(); void registerMatMulVectorizationPass(); - +void registerMatMulParallelVectorizationPass(); +void registerTransposeOptimizationPass(); void registerConvOptimizePass(); void registerLowerVectorExpPass(); void registerLowerGemminiPass(); @@ -84,7 +85,9 @@ int main(int argc, char **argv) { // Register Several Optimize Pass. mlir::buddy::registerMatMulOptimizePass(); mlir::buddy::registerMatMulVectorizationPass(); + mlir::buddy::registerMatMulParallelVectorizationPass(); mlir::buddy::registerBatchMatMulOptimizePass(); + mlir::buddy::registerTransposeOptimizationPass(); mlir::buddy::registerConvOptimizePass(); mlir::DialectRegistry registry;