diff --git a/examples/MLIRSparseTensor/makefile b/examples/MLIRSparseTensor/makefile
index 6afd5f5e51..f31e3adf19 100644
--- a/examples/MLIRSparseTensor/makefile
+++ b/examples/MLIRSparseTensor/makefile
@@ -105,3 +105,24 @@ sparse-tensor-expand-lower:
 	--linalg-generalize-named-ops \
 	--linalg-fuse-elementwise-ops \
 	--sparsification -o log.mlir
+
+# This target will show the original for-loop without vectorization,
+# which is useful to compare with the vectorized version.
+sparse-tensor-vectorization-linalg-lower:
+	@${MLIR_OPT} ./sparse-tensor-vectorization.mlir \
+	--linalg-generalize-named-ops \
+	--linalg-fuse-elementwise-ops \
+	--sparsification \
+	-o log.mlir
+sparse-tensor-vectorization-lower:
+	@${MLIR_OPT} ./sparse-tensor-vectorization.mlir \
+	--sparsification --cse \
+	--sparse-vectorization="vl=16" --cse \
+	-o log.mlir
+# This example is used for code verification only, as there is currently no Arm SVE machine for us to run the code on.
+# Do the same run, but with VLA enabled.
+sparse-tensor-vla-vectorization-lower:
+	@${MLIR_OPT} ./sparse-tensor-vectorization.mlir \
+	--sparsification --cse \
+	--sparse-vectorization="vl=16 enable-vla-vectorization=true" --cse \
+	-o log.mlir
diff --git a/examples/MLIRSparseTensor/sparse-tensor-vectorization.mlir b/examples/MLIRSparseTensor/sparse-tensor-vectorization.mlir
new file mode 100644
index 0000000000..5ed3c14bc0
--- /dev/null
+++ b/examples/MLIRSparseTensor/sparse-tensor-vectorization.mlir
@@ -0,0 +1,52 @@
+#SparseVector = #sparse_tensor.encoding<{
+  dimLevelType = ["compressed"]
+}>
+
+#trait_mul = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>,  // b
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "x(i) = a(i) * b(i)"
+}
+
+// Example for parallel loop vectorization
+func.func @sparse_mul(%arga: tensor<1024xf32, #SparseVector>,
+                      %argb: tensor<1024xf32>,
+                      %argx: tensor<1024xf32>) -> tensor<1024xf32> {
+  %0 = linalg.generic #trait_mul
+    ins(%arga, %argb: tensor<1024xf32, #SparseVector>, tensor<1024xf32>)
+    outs(%argx: tensor<1024xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = arith.mulf %a, %b : f32
+        linalg.yield %0 : f32
+  } -> tensor<1024xf32>
+  return %0 : tensor<1024xf32>
+}
+
+#trait_reduction = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>,  // b
+    affine_map<(i) -> ()>    // x (out)
+  ],
+  iterator_types = ["reduction"],
+  doc = "x += a(i) * b(i)"
+}
+
+// Example for reduction loop vectorization
+func.func @sparse_reduction(%arga: tensor<1024xf32, #SparseVector>,
+                            %argb: tensor<1024xf32>,
+                            %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_reduction
+    ins(%arga, %argb: tensor<1024xf32, #SparseVector>, tensor<1024xf32>)
+    outs(%argx: tensor<f32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = arith.mulf %a, %b : f32
+        %1 = arith.addf %x, %0 : f32
+        linalg.yield %1 : f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}