From 224a8b94d9c886f8e8aa6c8fc3f38fa17a32d756 Mon Sep 17 00:00:00 2001
From: sforekeeper
Date: Fri, 3 Nov 2023 13:08:13 +0800
Subject: [PATCH] Add GPU transform lowering examples.

---
 examples/MLIRTransform/gpu-nested-forall.mlir | 23 +++++++++++
 .../gpu-reduction-tile-forall.mlir            | 21 ++++++++++
 examples/MLIRTransform/makefile               | 40 +++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 examples/MLIRTransform/gpu-nested-forall.mlir
 create mode 100644 examples/MLIRTransform/gpu-reduction-tile-forall.mlir

diff --git a/examples/MLIRTransform/gpu-nested-forall.mlir b/examples/MLIRTransform/gpu-nested-forall.mlir
new file mode 100644
index 0000000000..5fbbae23ab
--- /dev/null
+++ b/examples/MLIRTransform/gpu-nested-forall.mlir
@@ -0,0 +1,23 @@
+!type4d = memref<32x64x4x32xf32>
+
+func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+  scf.forall (%i, %j) in (%c32, %c64) {
+    scf.forall (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %y[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
+  return %y : !type4d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !transform.any_op):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
+  transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1] : (!transform.any_op) -> !transform.any_op
+}
diff --git a/examples/MLIRTransform/gpu-reduction-tile-forall.mlir b/examples/MLIRTransform/gpu-reduction-tile-forall.mlir
new file mode 100644
index 0000000000..dd89a774e8
--- /dev/null
+++ b/examples/MLIRTransform/gpu-reduction-tile-forall.mlir
@@ -0,0 +1,21 @@
+func.func @reduction_tile_parallel_cyclic_dist(
+    %arg0: tensor<?x?xf32>, %out: tensor<?xf32>) -> tensor<?xf32> {
+  %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                          affine_map<(d0, d1) -> (d0)>],
+    iterator_types = ["parallel", "reduction"]}
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%out : tensor<?xf32>) {
+  ^bb0(%arg7: f32, %arg9: f32):
+    %1 = arith.mulf %arg7, %arg7 : f32
+    %2 = arith.addf %1, %arg9 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?xf32>
+  return %red : tensor<?xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+  %1, %2, %3, %loop = transform.structured.tile_reduction_using_forall %0
+    by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+}
diff --git a/examples/MLIRTransform/makefile b/examples/MLIRTransform/makefile
index b314bc048b..173f6e8906 100644
--- a/examples/MLIRTransform/makefile
+++ b/examples/MLIRTransform/makefile
@@ -5,10 +5,12 @@ MLIR_OPT := ../../llvm/build/bin/mlir-opt
 MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
 MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
 LLC := ../../llvm/build/bin/llc
+CLANG := ../../llvm/build/bin/clang
 OPT_FLAG := -O0
 MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
 MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so
 
 transform-conv2d-im2col-lower:
 	@${MLIR_OPT} ./transform-conv2d-im2col.mlir \
@@ -116,3 +118,41 @@ transform-batch-matmul-e2e-rv-o:
 	${LLC} ${OPT_FLAG} -mtriple riscv64 -target-abi lp64d \
 	-mattr=+m,+d,+v -filetype=obj -riscv-v-vector-bits-min=128 \
 	-o ./transform-batch-matmul-e2e.o
+
+transform-gpu-nested-forall-lower:
+	@${MLIR_OPT} ./gpu-nested-forall.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	--gpu-kernel-outlining \
+	--convert-gpu-to-nvvm \
+	--reconcile-unrealized-casts | \
+	${MLIR_OPT} \
+	--llvm-request-c-wrappers | \
+	${MLIR_OPT} \
+	--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o gpu-nested-forall-lower.mlir
+
+transform-gpu-reduction-tile-first-step:
+	@${MLIR_OPT} ./gpu-reduction-tile-forall.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	-one-shot-bufferize="bufferize-function-boundaries" \
+	-o gpu-reduction-tile-first-step.mlir
+
+transform-gpu-reduction-tile-second-step:
+	@${MLIR_OPT} ./gpu-reduction-tile-first-step.mlir \
+	--test-transform-dialect-interpreter \
+	--test-transform-dialect-erase-schedule \
+	--canonicalize | \
+	${MLIR_OPT} \
+	--gpu-kernel-outlining \
+	--convert-gpu-to-nvvm \
+	--reconcile-unrealized-casts | \
+	${MLIR_OPT} \
+	--llvm-request-c-wrappers | \
+	${MLIR_OPT} \
+	--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" \
+	-o gpu-reduction-tile-second-step.mlir
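
Usage sketch for the new targets (an assumption, not part of the diff: it presumes the llvm
submodule behind the paths at the top of the makefile was built with the NVPTX backend and
MLIR's CUDA runner support, and that the cubin-chip=sm_80 setting matches the local GPU):

    # run the transform schedules and lower each example from examples/MLIRTransform
    cd examples/MLIRTransform
    make transform-gpu-nested-forall-lower          # map forall loops to blocks/threads, lower to NVVM
    make transform-gpu-reduction-tile-first-step    # tile the reduction and bufferize
    make transform-gpu-reduction-tile-second-step   # lower the tiled, bufferized IR to NVVM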