From 224a8b94d9c886f8e8aa6c8fc3f38fa17a32d756 Mon Sep 17 00:00:00 2001
From: sforekeeper
Date: Fri, 3 Nov 2023 13:08:13 +0800
Subject: [PATCH] Add GPU transform lowering examples.

---
 examples/MLIRTransform/gpu-nested-forall.mlir | 23 +++++++++++
 .../gpu-reduction-tile-forall.mlir            | 21 ++++++++++
 examples/MLIRTransform/makefile               | 40 +++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 examples/MLIRTransform/gpu-nested-forall.mlir
 create mode 100644 examples/MLIRTransform/gpu-reduction-tile-forall.mlir

diff --git a/examples/MLIRTransform/gpu-nested-forall.mlir b/examples/MLIRTransform/gpu-nested-forall.mlir
new file mode 100644
index 0000000000..5fbbae23ab
--- /dev/null
+++ b/examples/MLIRTransform/gpu-nested-forall.mlir
@@ -0,0 +1,23 @@
+!type4d = memref<32x64x4x32xf32>
+
+func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+  scf.forall (%i, %j) in (%c32, %c64) {
+    scf.forall (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %y[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
+  return %y : !type4d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !transform.any_op):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
+  transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1] : (!transform.any_op) -> !transform.any_op
+}
diff --git a/examples/MLIRTransform/gpu-reduction-tile-forall.mlir b/examples/MLIRTransform/gpu-reduction-tile-forall.mlir
new file mode 100644
index 0000000000..dd89a774e8
--- /dev/null
+++ b/examples/MLIRTransform/gpu-reduction-tile-forall.mlir
@@ -0,0 +1,21 @@
+func.func @reduction_tile_parallel_cyclic_dist(
+    %arg0: tensor<?x?xf32>, %out: tensor<?xf32>) -> tensor<?xf32> {
+  %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                          affine_map<(d0, d1) -> (d0)>],
+    iterator_types = ["parallel", "reduction"]}
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%out : tensor<?xf32>) {
+  ^bb0(%arg7: f32, %arg9: f32):
+    %1 = arith.mulf %arg7, %arg7 : f32
+    %2 = arith.addf %1, %arg9 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?xf32>
+  return %red : tensor<?xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+  %1, %2, %3, %loop = transform.structured.tile_reduction_using_forall %0
+    by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+}
diff --git a/examples/MLIRTransform/makefile b/examples/MLIRTransform/makefile
index b314bc048b..173f6e8906 100644
--- a/examples/MLIRTransform/makefile
+++ b/examples/MLIRTransform/makefile
@@ -5,10 +5,12 @@ MLIR_OPT := ../../llvm/build/bin/mlir-opt
 MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
 MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
 LLC := ../../llvm/build/bin/llc
+CLANG := ../../llvm/build/bin/clang
 OPT_FLAG := -O0
 MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
 MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so
 
 transform-conv2d-im2col-lower:
 	@${MLIR_OPT} ./transform-conv2d-im2col.mlir \
@@ -116,3 +118,41 @@ transform-batch-matmul-e2e-rv-o:
 	${LLC} ${OPT_FLAG} -mtriple riscv64 -target-abi lp64d \
 	-mattr=+m,+d,+v -filetype=obj -riscv-v-vector-bits-min=128 \
 	-o ./transform-batch-matmul-e2e.o
+
+transform-gpu-nested-forall-lower:
+	@${MLIR_OPT} ./gpu-nested-forall.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	--gpu-kernel-outlining \
+	--convert-gpu-to-nvvm \
+	--reconcile-unrealized-casts | \
+	${MLIR_OPT} \
+	--llvm-request-c-wrappers | \
+	${MLIR_OPT} \
+	--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o gpu-nested-forall-lower.mlir
+
+transform-gpu-reduction-tile-first-step:
+	@${MLIR_OPT} ./gpu-reduction-tile-forall.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	-one-shot-bufferize="bufferize-function-boundaries" \
+	-o gpu-reduction-tile-first-step.mlir
+
+transform-gpu-reduction-tile-second-step:
+	@${MLIR_OPT} ./gpu-reduction-tile-first-step.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	--gpu-kernel-outlining \
+	--convert-gpu-to-nvvm \
+	--reconcile-unrealized-casts | \
+	${MLIR_OPT} \
+	--llvm-request-c-wrappers | \
+	${MLIR_OPT} \
+	--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" \
+	-o gpu-reduction-tile-second-step.mlir
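
Usage sketch for the new targets (an assumption, not part of the diff: it presumes the llvm
submodule behind the paths at the top of the makefile was built with the NVPTX backend and
MLIR's CUDA runner support, and that the cubin-chip=sm_80 setting matches the local GPU):

    # run the transform schedules and lower each example from examples/MLIRTransform
    cd examples/MLIRTransform
    make transform-gpu-nested-forall-lower          # map forall loops to blocks/threads, lower to NVVM
    make transform-gpu-reduction-tile-first-step    # tile the reduction and bufferize
    make transform-gpu-reduction-tile-second-step   # lower the tiled, bufferized IR to NVVM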