[DAP] Add iir_simd operation for DAP dialect.

buddy-compiler · Oct 29, 2023 · abc66ab · abc66ab
1 parent 19cfc6c
commit abc66ab
Show file tree

Hide file tree

Showing 4 changed files with 259 additions and 12 deletions.
diff --git a/frontend/Interfaces/buddy/DAP/DSP/IIR.h b/frontend/Interfaces/buddy/DAP/DSP/IIR.h
@@ -34,9 +34,17 @@ void _mlir_ciface_mlir_iir(MemRef<float, 1> *inputBuddyConv1D,
                            MemRef<float, 2> *kernelBuddyConv1D,
                            MemRef<float, 1> *outputBuddyConv1D);
 
+void _mlir_ciface_mlir_iir_simd(MemRef<double, 1> *inputBuddyConv1D, // rank-1 double buffer holding the input signal
+                           MemRef<double, 2> *kernelBuddyConv1D, // rank-2 double buffer holding the filter coefficients
+                           MemRef<double, 1> *outputBuddyConv1D); // rank-1 double buffer for the result; NOTE(review): the `mlir_iir_simd` definition is not visible in this diff - confirm it exists
+
 void _mlir_ciface_buddy_iir(MemRef<float, 1> *inputBuddyConv1D,
                             MemRef<float, 2> *kernelBuddyConv1D,
                             MemRef<float, 1> *outputBuddyConv1D);
+
+void _mlir_ciface_buddy_iir_simd(MemRef<double, 1> *inputBuddyConv1D, // rank-1 double buffer holding the input signal
+                            MemRef<double, 2> *kernelBuddyConv1D, // rank-2 double buffer holding the filter coefficients
+                            MemRef<double, 1> *outputBuddyConv1D); // rank-1 double buffer for the result; presumably binds to `func.func @buddy_iir_simd` in DAP.mlir - verify the C-interface wrapper is emitted
 }
 } // namespace detail
 

diff --git a/frontend/Interfaces/lib/DAP.mlir b/frontend/Interfaces/lib/DAP.mlir
@@ -28,6 +28,11 @@ func.func @buddy_iir(%in : memref<?xf32>, %filter : memref<?x?xf32>, %out : memr
   return
 } 
 
+func.func @buddy_iir_simd(%in : memref<?xf64>, %filter : memref<?x?xf64>, %out : memref<?xf64>) -> () { // thin wrapper: forwards its three f64 memrefs to a single dap.iir_simd op
+  dap.iir_simd %in, %filter, %out : memref<?xf64>, memref<?x?xf64>, memref<?xf64> // operand order: input, filter, output
+  return
+} 
+
 func.func @buddy_biquad(%in : memref<?xf32>, %filter : memref<?xf32>, %out : memref<?xf32>) -> () {
   dap.biquad %in, %filter, %out : memref<?xf32>, memref<?xf32>, memref<?xf32>
   return

diff --git a/midend/include/Dialect/DAP/DAPOps.td b/midend/include/Dialect/DAP/DAPOps.td
@@ -94,4 +94,23 @@ def DAP_IirOp : DAP_Op<"iir"> {
   }];
 }
 
+// SIMD variant of the IIR filter operation. Takes the input signal, the
+// filter kernel and the output buffer as memref operands and has no results.
+def DAP_IirSimdOp : DAP_Op<"iir_simd"> {
+  let summary = "SIMD infinite impulse response (IIR) filter";
+  let description = [{
+    Applies an infinite impulse response (IIR) filter to the input memref and
+    writes the filtered signal to the output memref. Unlike FIR filters, IIR
+    filters have a feedback (recursive) part. This is the SIMD version of the
+    `dap.iir` operation.
+
+    Example:
+
+    ```mlir
+    dap.iir_simd %input, %kernel, %output : memref<?xf64>, memref<?x?xf64>,
+                                            memref<?xf64>
+    ```
+  }];
+  // The output buffer is written by the operation, so it carries MemWrite.
+  // Declaring it MemRead would leave the op with read-only effects and no
+  // results, which lets generic dead-code elimination erase it.
+  let arguments = (ins Arg<AnyRankedOrUnrankedMemRef, "inputMemref",
+                           [MemRead]>:$memrefI,
+                       Arg<AnyRankedOrUnrankedMemRef, "kernelMemref",
+                           [MemRead]>:$memrefK,
+                       Arg<AnyRankedOrUnrankedMemRef, "outputMemref",
+                           [MemWrite]>:$memrefO);
+  let assemblyFormat = [{
+    $memrefI `,` $memrefK `,` $memrefO attr-dict `:` type($memrefI) `,` type($memrefK) `,` type($memrefO)
+  }];
+}
+
 #endif // DAP_DAPOPS_TD
diff --git a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp
@@ -21,11 +21,11 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
 
 #include "DAP/DAPDialect.h"
 #include "DAP/DAPOps.h"
@@ -282,25 +282,26 @@ class DAPIirLowering : public OpRewritePattern<dap::IirOp> {
           // process the remain data of FIR part
           Value idx1 = builder.create<SubIOp>(loc, upperN, c1);
           Value idx2 = builder.create<SubIOp>(loc, upperN, c2);
-          Value in1 = 
+          Value in1 =
               builder.create<memref::LoadOp>(loc, input, ValueRange{idx1});
-          Value in2 = 
+          Value in2 =
               builder.create<memref::LoadOp>(loc, input, ValueRange{idx2});
 
           builder.create<scf::ForOp>(
-              loc, upperN, N, c1, ValueRange{in1, in2}, 
+              loc, upperN, N, c1, ValueRange{in1, in2},
               [&](OpBuilder &builder, Location loc, Value iv,
                   ValueRange itrargs) {
-                Value in0 = 
+                Value in0 =
                     builder.create<memref::LoadOp>(loc, input, ValueRange{iv});
 
                 Value temp0 = builder.create<MulFOp>(loc, b0, in0);
                 Value temp1 = builder.create<MulFOp>(loc, b1, in1);
                 Value temp2 = builder.create<MulFOp>(loc, b2, in2);
                 Value sum0 = builder.create<AddFOp>(loc, temp0, temp1);
                 Value sum1 = builder.create<AddFOp>(loc, sum0, temp2);
-
-                builder.create<memref::StoreOp>(loc, sum1, output, ValueRange{iv});
+
+                builder.create<memref::StoreOp>(loc, sum1, output,
+                                                ValueRange{iv});
 
                 builder.create<scf::YieldOp>(loc, std::vector<Value>{in0, in1});
               });
@@ -334,13 +335,226 @@ class DAPIirLowering : public OpRewritePattern<dap::IirOp> {
   int64_t stride;
 };
 
+/// Rewrites `dap.iir_simd` into scf/vector/arith/memref operations.
+///
+/// Operand 1 (the kernel) is read one row per cascaded section: columns 0, 1
+/// and 2 supply b0, b1 and b2, columns 4 and 5 supply a1 and a2, and column 3
+/// (a0) is never read. Row `i` is placed in lane `i` of five coefficient
+/// vectors; lanes without a row keep b0 = 1 and every other coefficient 0, so
+/// they forward their input unchanged.
+///
+/// The sections run as a pipeline with one section per lane. On every step the
+/// previous output vector is shifted up by one lane, a new sample enters lane
+/// 0, and each lane advances the recurrence
+///   out = b0 * in + s1
+///   s1  = b1 * in + s2 - a1 * out
+///   s2  = b2 * in      - a2 * out
+/// The last lane's output on step `t` is the filtered sample `t - 15`, so the
+/// first 15 steps only fill the pipeline and the final 15 steps drain it with
+/// zero input.
+///
+/// NOTE(review): the generated IR uses f64 vectors and constants, so the
+/// operands are assumed to be f64 memrefs; kernels with more than 16 rows are
+/// not handled (row indices are used directly as lane positions).
+class DAPIirSimdLowering : public OpRewritePattern<dap::IirSimdOp> {
+public:
+  using OpRewritePattern<dap::IirSimdOp>::OpRewritePattern;
+
+  /// \param context     MLIR context the pattern is registered in.
+  /// \param strideParam Vector splitting size passed to every DAP lowering;
+  ///                    stored but not consulted by this pattern.
+  explicit DAPIirSimdLowering(MLIRContext *context, int64_t strideParam)
+      : OpRewritePattern(context), stride(strideParam) {}
+
+  /// Replaces `op` with the pipelined loops described on the class and erases
+  /// it. Always succeeds.
+  LogicalResult matchAndRewrite(dap::IirSimdOp op,
+                                PatternRewriter &rewriter) const override {
+    // TODO : Change the vector length value to an Attribute
+    constexpr int64_t kLanes = 16;
+    // Number of steps a sample needs before it reaches the last lane.
+    constexpr int64_t kFillSteps = kLanes - 1;
+
+    auto loc = op->getLoc();
+    auto ctx = op->getContext();
+
+    Value input = op->getOperand(0);
+    Value kernel = op->getOperand(1);
+    Value output = op->getOperand(2);
+
+    Value c0 = rewriter.create<ConstantIndexOp>(loc, 0);
+    Value c1 = rewriter.create<ConstantIndexOp>(loc, 1);
+    Value c2 = rewriter.create<ConstantIndexOp>(loc, 2);
+    Value c4 = rewriter.create<ConstantIndexOp>(loc, 4);
+    Value c5 = rewriter.create<ConstantIndexOp>(loc, 5);
+    Value cFill = rewriter.create<ConstantIndexOp>(loc, kFillSteps);
+    // Position of the last lane, used to read the finished sample.
+    Value lastLane = rewriter.create<arith::ConstantIntOp>(
+        loc, /*value=*/kFillSteps, /*width=*/64);
+
+    Value N = rewriter.create<memref::DimOp>(loc, input, c0);
+    Value filterSize = rewriter.create<memref::DimOp>(loc, kernel, c0);
+
+    FloatType f64 = FloatType::getF64(ctx);
+    VectorType vectorTy64 = VectorType::get({kLanes}, f64);
+
+    Value f0 = rewriter.create<ConstantFloatOp>(loc, APFloat(0.0), f64);
+    Value f1 = rewriter.create<ConstantFloatOp>(loc, APFloat(1.0), f64);
+
+    // Defaults for lanes that receive no kernel row: b0 = 1, the rest 0.
+    Value onesVec = rewriter.create<vector::SplatOp>(loc, vectorTy64, f1);
+    Value zeroVec = rewriter.create<vector::SplatOp>(loc, vectorTy64, f0);
+
+    // Distribute all params into 5 param vectors, one kernel row per lane.
+    auto vecDistribute = rewriter.create<scf::ForOp>(
+        loc, c0, filterSize, c1,
+        ValueRange{onesVec, zeroVec, zeroVec, zeroVec, zeroVec},
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+          Value b0 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c0});
+          Value b1 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c1});
+          Value b2 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c2});
+          // Column 3 (a0) of the kernel is not used.
+          Value a1 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c4});
+          Value a2 =
+              builder.create<memref::LoadOp>(loc, kernel, ValueRange{iv, c5});
+
+          Value nextB0 =
+              builder.create<vector::InsertElementOp>(loc, b0, iargs[0], iv);
+          Value nextB1 =
+              builder.create<vector::InsertElementOp>(loc, b1, iargs[1], iv);
+          Value nextB2 =
+              builder.create<vector::InsertElementOp>(loc, b2, iargs[2], iv);
+          Value nextA1 =
+              builder.create<vector::InsertElementOp>(loc, a1, iargs[3], iv);
+          Value nextA2 =
+              builder.create<vector::InsertElementOp>(loc, a2, iargs[4], iv);
+
+          builder.create<scf::YieldOp>(
+              loc,
+              std::vector<Value>{nextB0, nextB1, nextB2, nextA1, nextA2});
+        });
+
+    Value vecB0 = vecDistribute.getResult(0);
+    Value vecB1 = vecDistribute.getResult(1);
+    Value vecB2 = vecDistribute.getResult(2);
+    Value vecA1 = vecDistribute.getResult(3);
+    Value vecA2 = vecDistribute.getResult(4);
+
+    // Shuffle mask {0, 0, 1, ..., kLanes - 2}: lane k takes the value of lane
+    // k - 1. Lane 0 is a placeholder that is overwritten with the new sample.
+    std::vector<int64_t> shiftMask{0};
+    for (int64_t src = 0; src < kFillSteps; ++src)
+      shiftMask.push_back(src);
+
+    // Emits one pipeline step and the terminating yield of the enclosing
+    // loop. `state` is {previous output vector, s1, s2}. When `storeIdx` is
+    // set, the last lane's output is stored to output[storeIdx].
+    auto emitStep = [&](OpBuilder &builder, Location loc, Value sample,
+                        ValueRange state, Value storeIdx) {
+      Value shifted = builder.create<vector::ShuffleOp>(
+          loc, state[0], state[0], ArrayRef<int64_t>(shiftMask));
+      Value vecIn =
+          builder.create<vector::InsertElementOp>(loc, sample, shifted, c0);
+      Value vecOutNext =
+          builder.create<vector::FMAOp>(loc, vecB0, vecIn, state[1]);
+
+      if (storeIdx) {
+        Value outElem = builder.create<vector::ExtractElementOp>(
+            loc, vecOutNext, lastLane);
+        builder.create<memref::StoreOp>(loc, outElem, output,
+                                        ValueRange{storeIdx});
+      }
+
+      Value s1Lhs = builder.create<vector::FMAOp>(loc, vecB1, vecIn, state[2]);
+      Value s1Rhs = builder.create<arith::MulFOp>(loc, vecA1, vecOutNext);
+      Value s1Next = builder.create<arith::SubFOp>(loc, s1Lhs, s1Rhs);
+
+      Value s2Lhs = builder.create<arith::MulFOp>(loc, vecB2, vecIn);
+      Value s2Rhs = builder.create<arith::MulFOp>(loc, vecA2, vecOutNext);
+      Value s2Next = builder.create<arith::SubFOp>(loc, s2Lhs, s2Rhs);
+
+      builder.create<scf::YieldOp>(
+          loc, std::vector<Value>{vecOutNext, s1Next, s2Next});
+    };
+
+    // Fill section: kFillSteps steps that produce no output yet. Only the
+    // first min(N, kFillSteps) steps read the input, so inputs shorter than
+    // the pipeline are never read out of bounds.
+    Value loadedFillEnd = rewriter.create<arith::MinSIOp>(loc, N, cFill);
+    auto fillLoaded = rewriter.create<scf::ForOp>(
+        loc, c0, loadedFillEnd, c1, ValueRange{zeroVec, zeroVec, zeroVec},
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+          Value sample =
+              builder.create<memref::LoadOp>(loc, input, ValueRange{iv});
+          emitStep(builder, loc, sample, iargs, Value());
+        });
+
+    // Remaining fill steps (only when N < kFillSteps) are fed with zeros.
+    auto fillPadded = rewriter.create<scf::ForOp>(
+        loc, loadedFillEnd, cFill, c1, fillLoaded.getResults(),
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+          emitStep(builder, loc, f0, iargs, Value());
+        });
+
+    // Full process section: consumes input[iv + kFillSteps] and produces
+    // output[iv]. The bound is clamped at zero so short inputs skip it.
+    Value remaining = rewriter.create<arith::SubIOp>(loc, N, cFill);
+    Value steadyEnd = rewriter.create<arith::MaxSIOp>(loc, remaining, c0);
+    auto steady = rewriter.create<scf::ForOp>(
+        loc, c0, steadyEnd, c1, fillPadded.getResults(),
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+          Value index = builder.create<arith::AddIOp>(loc, iv, cFill);
+          Value sample =
+              builder.create<memref::LoadOp>(loc, input, ValueRange{index});
+          emitStep(builder, loc, sample, iargs, iv);
+        });
+
+    // Tail section: drains the samples still in flight by feeding zeros.
+    rewriter.create<scf::ForOp>(
+        loc, steadyEnd, N, c1, steady.getResults(),
+        [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) {
+          emitStep(builder, loc, f0, iargs, iv);
+        });
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+
+private:
+  // Kept so the constructor matches the other DAP lowerings; this pattern
+  // does not read it.
+  [[maybe_unused]] int64_t stride;
+};
+
 } // end anonymous namespace
 
 void populateLowerDAPConversionPatterns(RewritePatternSet &patterns,
                                         int64_t stride) {
   patterns.add<DAPFirLowering>(patterns.getContext());
   patterns.add<DAPBiquadLowering>(patterns.getContext(), stride);
   patterns.add<DAPIirLowering>(patterns.getContext(), stride);
+  patterns.add<DAPIirSimdLowering>(patterns.getContext(), stride);
 }
 
 //===----------------------------------------------------------------------===//
@@ -363,7 +577,8 @@ class LowerDAPPass : public PassWrapper<LowerDAPPass, OperationPass<ModuleOp>> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<buddy::dap::DAPDialect, func::FuncDialect,
                     memref::MemRefDialect, scf::SCFDialect, VectorDialect,
-                    affine::AffineDialect, arith::ArithDialect,linalg::LinalgDialect>();
+                    affine::AffineDialect, arith::ArithDialect,
+                    linalg::LinalgDialect>();
   }
   Option<int64_t> stride{*this, "DAP-vector-splitting",
                          llvm::cl::desc("Vector splitting size."),
@@ -376,10 +591,10 @@ void LowerDAPPass::runOnOperation() {
   ModuleOp module = getOperation();
 
   ConversionTarget target(*context);
-  target.addLegalDialect<affine::AffineDialect, scf::SCFDialect,
-                         func::FuncDialect, memref::MemRefDialect,
-                         VectorDialect, arith::ArithDialect,
-                         linalg::LinalgDialect>();
+  target
+      .addLegalDialect<affine::AffineDialect, scf::SCFDialect,
+                       func::FuncDialect, memref::MemRefDialect, VectorDialect,
+                       arith::ArithDialect, linalg::LinalgDialect>();
   target.addLegalOp<ModuleOp, func::FuncOp, func::ReturnOp>();
 
   RewritePatternSet patterns(context);