Skip to content

Commit

Permalink
[midend][examples] Add MatMulParallelVectorization, BuiltinTransposeV…
Browse files Browse the repository at this point in the history
…ectorization, BatchMatMulOptimize. (#215)

* [midend] Add MatMulParallelVectorization and optimized BatchMatMulOptimize.

* [examples] Fix makefile arguments and add MatMulParallelVectorization testcase.

* [midend][examples] Add parallelize BuiltinTransposeVectorization and tests.

* [midend] Restrict the transpose optimize for 2 rank tensor and remove unused header files.

* [midend] Canonicalize dynamic rank detection.
  • Loading branch information
EllisLambda authored Oct 31, 2023
1 parent 4cee632 commit 719e7de
Show file tree
Hide file tree
Showing 14 changed files with 1,340 additions and 186 deletions.
7 changes: 3 additions & 4 deletions examples/MLIRLinalg/linalg-batch-matmul-f32.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_f32 \
// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_f32 -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

Expand All @@ -9,7 +9,7 @@ memref.global "private" @C : memref<2x2x4xf32> = dense<[[[ 49., 113., 146., 82.

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

func.func @buddy_batchmatmul_f32() -> f32{
func.func @buddy_batchmatmul_f32(){
%a = memref.get_global @A : memref<2x2x3xf32>
%b = memref.get_global @B : memref<2x3x4xf32>
%c = memref.get_global @C : memref<2x2x4xf32>
Expand All @@ -24,6 +24,5 @@ func.func @buddy_batchmatmul_f32() -> f32{
// CHECK{LITERAL}: [12, 76, 96, 56]],
// CHECK{LITERAL}: [[48, 162, 72, 156],
// CHECK{LITERAL}: [16, 112, 0, 104]]]
%zero = arith.constant 0.0 :f32
return %zero :f32
return
}
7 changes: 3 additions & 4 deletions examples/MLIRLinalg/linalg-batch-matmul-i8.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_i8 \
// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_i8 -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

Expand All @@ -9,7 +9,7 @@ memref.global "private" @C : memref<2x2x4xi8> = dense<[[[49, 12, 14, 82],[6, 38,

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

func.func @buddy_batchmatmul_i8() -> f32{
func.func @buddy_batchmatmul_i8(){
%a = memref.get_global @A : memref<2x2x3xi8>
%b = memref.get_global @B : memref<2x3x4xi8>
%c = memref.get_global @C : memref<2x2x4xi8>
Expand Down Expand Up @@ -41,6 +41,5 @@ func.func @buddy_batchmatmul_i8() -> f32{
// CHECK{LITERAL}: [12, 76, 96, 56]],
// CHECK{LITERAL}: [[48, -94, 72, -100],
// CHECK{LITERAL}: [16, 112, 0, 104]]]
%zero = arith.constant 0.0 :f32
return %zero :f32
return
}
28 changes: 28 additions & 0 deletions examples/MLIRLinalg/linalg-matmul-opt-f32.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_matmul_f32 -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

memref.global "private" @A : memref<4x3xf32> = dense<[[9., 4., 6.],[2., 4., 0.],[6., 3., 3.],[0., 4., 7.]]>
memref.global "private" @B : memref<3x4xf32> = dense<[[1., 3., 8., 0.],[1., 8., 8., 7.], [6., 9., 7., 9.]]>
memref.global "private" @C : memref<4x4xf32> = dense<[[49., 113., 146., 82.],[6., 38., 48., 28.],[24., 81., 36., 78.],[8., 56., 0., 52.]]>

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

// Test entry point (invoked by mlir-cpu-runner via the RUN line above):
// runs linalg.matmul on the 4x3 and 3x4 f32 globals, accumulating into the
// pre-initialized 4x4 output @C, then prints the result for FileCheck.
// The CHECK values below are A*B plus the initial contents of @C
// (e.g. (A*B)[0][0] = 49 and C[0][0] = 49., giving 98), reflecting
// linalg.matmul's accumulate-into-output semantics.
func.func @buddy_matmul_f32(){
// Load input/output buffers from the module-level globals.
%a = memref.get_global @A : memref<4x3xf32>
%b = memref.get_global @B : memref<3x4xf32>
%c = memref.get_global @C : memref<4x4xf32>

// C += A * B
linalg.matmul
ins(%a, %b: memref<4x3xf32>, memref<3x4xf32>)
outs(%c: memref<4x4xf32>)
// Erase the static shape so the buffer can be passed to the generic
// runner-utils print helper.
%printed_c = memref.cast %c : memref<4x4xf32> to memref<*xf32>
call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
// CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 4\] strides = \[4, 1\] data =}}
// CHECK{LITERAL}: [[98, 226, 292, 164],
// CHECK{LITERAL}: [12, 76, 96, 56],
// CHECK{LITERAL}: [51, 150, 129, 126],
// CHECK{LITERAL}: [54, 151, 81, 143]]
return
}
42 changes: 42 additions & 0 deletions examples/MLIRLinalg/linalg-matmul-opt-i8.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_matmul_i8 -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

memref.global "private" @A : memref<4x3xi8> = dense<[[9, 4, 6],[2, 4, 0],[6, 3, 3],[0, 4, 7]]>
memref.global "private" @B : memref<3x4xi8> = dense<[[1, 3, 8, 0],[1, 8, 8, 7], [6, 9, 7, 9]]>
memref.global "private" @C : memref<4x4xi8> = dense<[[49, 113, 46, 82],[6, 38, 48, 28],[24, 81, 36, 78],[8, 56, 0, 52]]>

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

// Test entry point for the i8 variant: linalg.matmul on i8 operands
// accumulates into the pre-initialized i8 output @C. The i8 accumulation
// wraps on overflow, which the negative CHECK values below rely on
// (e.g. (A*B)[0][1] = 113 plus C[0][1] = 113 gives 226, printed as -30).
// printMemrefF32 only accepts f32 data, so the i8 result is widened
// element-by-element with arith.sitofp before printing.
func.func @buddy_matmul_i8(){
%a = memref.get_global @A : memref<4x3xi8>
%b = memref.get_global @B : memref<3x4xi8>
%c = memref.get_global @C : memref<4x4xi8>

// C += A * B (i8)
linalg.matmul
ins(%a, %b: memref<4x3xi8>, memref<3x4xi8>)
outs(%c: memref<4x4xi8>)

// Loop bounds and step for the 4x4 widening copy below.
%cst_0 = arith.constant 0 : index
%cst_1 = arith.constant 1 : index
%cst_4 = arith.constant 4 : index

// Stack buffer holding the f32 copy of the result, for printing only.
%c_f32 = memref.alloca() : memref<4x4xf32>
scf.for %i = %cst_0 to %cst_4 step %cst_1 {
scf.for %j = %cst_0 to %cst_4 step %cst_1 {
%val_i8 = memref.load %c[%i, %j] : memref<4x4xi8>
%val_f32 = arith.sitofp %val_i8 : i8 to f32
memref.store %val_f32, %c_f32[%i, %j] : memref<4x4xf32>
}
}

// Erase the static shape for the generic runner-utils print helper.
%printed_c = memref.cast %c_f32 : memref<4x4xf32> to memref<*xf32>
call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
// CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 4\] strides = \[4, 1\] data =}}
// CHECK{LITERAL}: [[98, -30, -64, -92],
// CHECK{LITERAL}: [12, 76, 96, 56],
// CHECK{LITERAL}: [51, -106, -127, 126],
// CHECK{LITERAL}: [54, -105, 81, -113]]
return
}
27 changes: 27 additions & 0 deletions examples/MLIRLinalg/linalg-transpose-f32.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// RUN: buddy-opt -transpose-optimize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c-wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_transpose_f32 -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: | FileCheck %s

memref.global "private" @A : memref<3x4xf32> = dense<[[1., 3., 8., 0.],[1., 8., 8., 7.], [6., 9., 7., 9.]]>

func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }

// Test entry point: transposes the 3x4 f32 global @A into a freshly
// allocated 4x3 buffer using linalg.transpose with permutation [1, 0],
// prints the result for FileCheck, then frees the buffer.
func.func @buddy_transpose_f32(){
%a = memref.get_global @A : memref<3x4xf32>
// Heap-allocated destination; released via memref.dealloc below.
%b = memref.alloc() : memref<4x3xf32>

// B[j][i] = A[i][j]
linalg.transpose
ins(%a: memref<3x4xf32>)
outs(%b: memref<4x3xf32>)
permutation = [1, 0]
// Erase the static shape for the generic runner-utils print helper.
%printed_b = memref.cast %b : memref<4x3xf32> to memref<*xf32>
call @printMemrefF32(%printed_b) : (memref<*xf32>) -> ()
memref.dealloc %b : memref<4x3xf32>
// CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 2 offset = 0 sizes = \[4, 3\] strides = \[3, 1\] data =}}
// CHECK{LITERAL}: [[1, 1, 6],
// CHECK{LITERAL}: [3, 8, 9],
// CHECK{LITERAL}: [8, 8, 7],
// CHECK{LITERAL}: [0, 7, 9]]
return
}
136 changes: 132 additions & 4 deletions examples/MLIRLinalg/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ linalg-batch-matmul-optimize-run:
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_f32 -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

linalg-batch-matmul-lower:
Expand All @@ -170,7 +170,7 @@ linalg-batch-matmul-run:
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
-convert-func-to-llvm -reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_f32 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

linalg-batch-matmul-optimize-lower:
@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
Expand Down Expand Up @@ -203,7 +203,7 @@ linalg-batch-matmul-i8-optimize-run:
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

linalg-batch-matmul-i8-lower:
Expand All @@ -225,7 +225,7 @@ linalg-batch-matmul-i8-run:
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
-convert-func-to-llvm -reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

linalg-batch-matmul-i8-optimize-lower:
@${BUDDY_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
Expand All @@ -246,6 +246,134 @@ linalg-batch-matmul-i8-optimize-translate:
-reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# --- f32 matmul through the MatMulParallelVectorization pass ---
# NOTE(review): target names ("parallized", "optmize") and the pass flag
# ("paralell") are misspelled; the flag spelling must match the pass name
# registered in buddy-opt (same spelling is used in the .mlir RUN lines),
# so it is left untouched here — confirm the registered name before renaming.

# Optimize, lower to LLVM dialect, and execute with mlir-cpu-runner.
linalg-matmul-parallized-vectorized-optmize-run:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
-convert-scf-to-cf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_matmul_f32 -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

# Apply only the optimization pass and dump the resulting MLIR to log.mlir
# for inspection.
linalg-matmul-parallized-vectorized-optmize-lower:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-o ./log.mlir

# Full lowering pipeline, then translate to LLVM IR (log.ll) instead of
# executing.
linalg-matmul-parallized-vectorized-optmize-translate:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
-convert-scf-to-cf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# --- i8 matmul through the MatMulParallelVectorization pass ---
# Mirrors the f32 targets above, driving linalg-matmul-opt-i8.mlir and the
# buddy_matmul_i8 entry point.
# NOTE(review): same intentional misspellings as the f32 targets
# ("parallized", "optmize", "paralell") — the flag must match the pass name
# registered in buddy-opt.

# Optimize, lower to LLVM dialect, and execute with mlir-cpu-runner.
linalg-matmul-i8-parallized-vectorized-optmize-run:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
-convert-scf-to-cf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_matmul_i8 -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

# Apply only the optimization pass and dump the resulting MLIR to log.mlir.
linalg-matmul-i8-parallized-vectorized-optmize-lower:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-o ./log.mlir

# Full lowering pipeline, then translate to LLVM IR (log.ll).
linalg-matmul-i8-parallized-vectorized-optmize-translate:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
-matmul-paralell-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
-convert-scf-to-cf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# --- transpose testcase targets (linalg-transpose-f32.mlir) ---
# "-optimize-*" targets go through buddy-opt's BuiltinTransposeVectorization
# pass (-transpose-optimize); the plain targets use upstream mlir-opt's
# -convert-linalg-to-loops as a baseline.

# Optimize with -transpose-optimize, lower, and execute with mlir-cpu-runner.
linalg-transpose-optimize-run:
@${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-transpose-optimize="vector-size=16" \
-lower-affine \
-convert-vector-to-scf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
-convert-arith-to-llvm \
-llvm-request-c-wrappers \
-convert-func-to-llvm \
-lower-affine \
-convert-arith-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_transpose_f32 -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

# Baseline lowering with upstream mlir-opt, dumped to log.mlir.
linalg-transpose-lower:
@${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
-convert-func-to-llvm -reconcile-unrealized-casts \
-o ./log.mlir

# Baseline lowering, translated to LLVM IR (log.ll).
linalg-transpose-translate:
@${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
-convert-func-to-llvm -reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll

# Baseline lowering, executed with mlir-cpu-runner.
linalg-transpose-run:
@${MLIR_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
-convert-func-to-llvm -reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_transpose_f32 -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

# Apply only the optimization pass and dump the resulting MLIR to log.mlir.
linalg-transpose-optimize-lower:
@${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-transpose-optimize="vector-size=16" \
-o ./log.mlir

# Optimized full pipeline, translated to LLVM IR (log.ll).
linalg-transpose-optimize-translate:
@${BUDDY_OPT} linalg-transpose-f32.mlir ${MLIR_OPT_OPTIONS} \
-transpose-optimize="vector-size=16" \
-lower-affine \
-convert-vector-to-scf \
-convert-vector-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
-convert-arith-to-llvm \
-llvm-request-c-wrappers \
-convert-func-to-llvm \
-lower-affine \
-convert-arith-to-llvm \
-reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll


linalg-conv2d_nchw_fchw-lower:
@${MLIR_OPT} ./linalg-conv2d_nchw_fchw.mlir \
-convert-linalg-to-loops -o ./log.mlir
Expand Down
1 change: 1 addition & 0 deletions midend/lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ add_subdirectory(LowerDIP)
add_subdirectory(LowerRVV)
add_subdirectory(LowerDAP)
add_subdirectory(MatMulOptimization)
add_subdirectory(TransposeOptimization)
add_subdirectory(ConvOptimization)
add_subdirectory(LowerVectorExp)
add_subdirectory(LowerGemmini)
Expand Down
Loading

0 comments on commit 719e7de

Please sign in to comment.