diff --git a/frontend/Interfaces/buddy/DAP/DSP/IIR.h b/frontend/Interfaces/buddy/DAP/DSP/IIR.h index c2c3bb1eb..10f124b74 100644 --- a/frontend/Interfaces/buddy/DAP/DSP/IIR.h +++ b/frontend/Interfaces/buddy/DAP/DSP/IIR.h @@ -34,9 +34,17 @@ void _mlir_ciface_mlir_iir(MemRef *inputBuddyConv1D, MemRef *kernelBuddyConv1D, MemRef *outputBuddyConv1D); +void _mlir_ciface_mlir_iir_simd(MemRef *inputBuddyConv1D, + MemRef *kernelBuddyConv1D, + MemRef *outputBuddyConv1D); + void _mlir_ciface_buddy_iir(MemRef *inputBuddyConv1D, MemRef *kernelBuddyConv1D, MemRef *outputBuddyConv1D); + +void _mlir_ciface_buddy_iir_simd(MemRef *inputBuddyConv1D, + MemRef *kernelBuddyConv1D, + MemRef *outputBuddyConv1D); } } // namespace detail diff --git a/frontend/Interfaces/lib/DAP.mlir b/frontend/Interfaces/lib/DAP.mlir index 41502ad45..8ebafe607 100644 --- a/frontend/Interfaces/lib/DAP.mlir +++ b/frontend/Interfaces/lib/DAP.mlir @@ -28,6 +28,11 @@ func.func @buddy_iir(%in : memref, %filter : memref, %out : memr return } +func.func @buddy_iir_simd(%in : memref, %filter : memref, %out : memref) -> () { + dap.iir_simd %in, %filter, %out : memref, memref, memref + return +} + func.func @buddy_biquad(%in : memref, %filter : memref, %out : memref) -> () { dap.biquad %in, %filter, %out : memref, memref, memref return diff --git a/midend/include/Dialect/DAP/DAPOps.td b/midend/include/Dialect/DAP/DAPOps.td index 9e7d894b9..44437c062 100644 --- a/midend/include/Dialect/DAP/DAPOps.td +++ b/midend/include/Dialect/DAP/DAPOps.td @@ -94,4 +94,23 @@ def DAP_IirOp : DAP_Op<"iir"> { }]; } +def DAP_IirSimdOp : DAP_Op<"iir_simd"> { + let summary = [{IIR filter, a infinite impulse response (IIR), Unlike FIR filters, + they have a feedback(a recursive part of a filter). This is the SIMD version for iir operation. 
+ ```mlir + dsp.iir_simd %input, %kernel, %output :memref, memref, + memref + ``` + }]; + let arguments = (ins Arg:$memrefI, + Arg:$memrefK, + Arg:$memrefO); + let assemblyFormat = [{ + $memrefI `,` $memrefK `,` $memrefO attr-dict `:` type($memrefI) `,` type($memrefK) `,` type($memrefO) + }]; +} + #endif // DAP_DAPOPS_TD diff --git a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp index 33148d547..f350b3004 100644 --- a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp +++ b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp @@ -21,11 +21,11 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "DAP/DAPDialect.h" #include "DAP/DAPOps.h" @@ -282,16 +282,16 @@ class DAPIirLowering : public OpRewritePattern { // process the remain data of FIR part Value idx1 = builder.create(loc, upperN, c1); Value idx2 = builder.create(loc, upperN, c2); - Value in1 = + Value in1 = builder.create(loc, input, ValueRange{idx1}); - Value in2 = + Value in2 = builder.create(loc, input, ValueRange{idx2}); builder.create( - loc, upperN, N, c1, ValueRange{in1, in2}, + loc, upperN, N, c1, ValueRange{in1, in2}, [&](OpBuilder &builder, Location loc, Value iv, ValueRange itrargs) { - Value in0 = + Value in0 = builder.create(loc, input, ValueRange{iv}); Value temp0 = builder.create(loc, b0, in0); @@ -299,8 +299,9 @@ class DAPIirLowering : public OpRewritePattern { Value temp2 = builder.create(loc, b2, in2); Value sum0 = builder.create(loc, temp0, temp1); Value sum1 = builder.create(loc, sum0, temp2); - - builder.create(loc, sum1, output, ValueRange{iv}); + + builder.create(loc, sum1, output, + 
ValueRange{iv}); builder.create(loc, std::vector{in0, in1}); }); @@ -334,6 +335,218 @@ class DAPIirLowering : public OpRewritePattern { int64_t stride; }; +class DAPIirSimdLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + explicit DAPIirSimdLowering(MLIRContext *context, int64_t strideParam) + : OpRewritePattern(context) { + stride = strideParam; + } + + LogicalResult matchAndRewrite(dap::IirSimdOp op, + PatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto ctx = op->getContext(); + + Value input = op->getOperand(0); + Value kernel = op->getOperand(1); + Value output = op->getOperand(2); + + Value c0 = rewriter.create(loc, 0); + Value c1 = rewriter.create(loc, 1); + Value c2 = rewriter.create(loc, 2); + Value c4 = rewriter.create(loc, 4); + Value c5 = rewriter.create(loc, 5); + Value c15 = rewriter.create(loc, 15); + // TODO : Change the vector length value to an Attribute + Value c16 = rewriter.create(loc, 16); + + Value N = rewriter.create(loc, input, c0); + Value filterSize = rewriter.create(loc, kernel, c0); + Value strideVal = rewriter.create(loc, stride); + + FloatType f64 = FloatType::getF64(ctx); + + VectorType vectorTy64 = VectorType::get(16, f64); + + Value f0 = rewriter.create(loc, APFloat(0.0d), f64); + Value f1 = rewriter.create(loc, APFloat(1.0d), f64); + + Value initB0 = rewriter.create(loc, vectorTy64, f1); + Value initB1 = rewriter.create(loc, vectorTy64, f0); + Value initB2 = rewriter.create(loc, vectorTy64, f0); + Value initA1 = rewriter.create(loc, vectorTy64, f0); + Value initA2 = rewriter.create(loc, vectorTy64, f0); + + // Distribute all params into 5 param vectors + auto vecDistribute = rewriter.create( + loc, c0, filterSize, c1, + ValueRange{initB0, initB1, initB2, initA1, initA2}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value b0 = + builder.create(loc, kernel, ValueRange{iv, c0}); + Value b1 = + builder.create(loc, kernel, ValueRange{iv, 
c1}); + Value b2 = + builder.create(loc, kernel, ValueRange{iv, c2}); + // Value a0 of kernel is not used + Value a1 = + builder.create(loc, kernel, ValueRange{iv, c4}); + Value a2 = + builder.create(loc, kernel, ValueRange{iv, c5}); + + Value B0_next = + builder.create(loc, b0, iargs[0], iv); + Value B1_next = + builder.create(loc, b1, iargs[1], iv); + Value B2_next = + builder.create(loc, b2, iargs[2], iv); + Value A1_next = + builder.create(loc, a1, iargs[3], iv); + Value A2_next = + builder.create(loc, a2, iargs[4], iv); + + builder.create( + loc, + std::vector{B0_next, B1_next, B2_next, A1_next, A2_next}); + }); + + Value vecB0 = vecDistribute.getResult(0); + Value vecB1 = vecDistribute.getResult(1); + Value vecB2 = vecDistribute.getResult(2); + Value vecA1 = vecDistribute.getResult(3); + Value vecA2 = vecDistribute.getResult(4); + + Value vecOut = rewriter.create(loc, vectorTy64, f0); + Value vecS1 = rewriter.create(loc, vectorTy64, f0); + Value vecS2 = rewriter.create(loc, vectorTy64, f0); + + // The SIMD version for IIR operation can be represented as a pipeline with + // {c16} stages. This loop represents the injection section, looping {stages-1} + // times. 
+ auto injectionResult = rewriter.create( + loc, c0, c15, c1, ValueRange{vecOut, vecS1, vecS2}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value in_elem = builder.create(loc, input, iv); + Value vecIn_move_right = builder.create( + loc, iargs[0], iargs[0], + ArrayRef{0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14}); + Value vecIn_next = builder.create( + loc, in_elem, vecIn_move_right, c0); + Value vecOut_next = + builder.create(loc, vecB0, vecIn_next, iargs[1]); + + Value vecS1_lhs = + builder.create(loc, vecB1, vecIn_next, iargs[2]); + Value vecS1_rhs = + builder.create(loc, vecA1, vecOut_next); + Value vecS1_next = + builder.create(loc, vecS1_lhs, vecS1_rhs); + + Value vecS2_lhs = + builder.create(loc, vecB2, vecIn_next); + Value vecS2_rhs = + builder.create(loc, vecA2, vecOut_next); + Value vecS2_next = + builder.create(loc, vecS2_lhs, vecS2_rhs); + + builder.create( + loc, std::vector{vecOut_next, vecS1_next, vecS2_next}); + }); + + Value vecOut_tmp1 = injectionResult.getResult(0); + Value vecS1_tmp1 = injectionResult.getResult(1); + Value vecS2_tmp1 = injectionResult.getResult(2); + + Value i15 = + rewriter.create(loc, /*value=*/15, /*width=*/64); + Value upperBound = rewriter.create(loc, N, c15); + + // This loop represents the full process section, which starts to produce output. 
+ auto processResult = rewriter.create( + loc, c0, upperBound, c1, + ValueRange{vecOut_tmp1, vecS1_tmp1, vecS2_tmp1}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value index = builder.create(loc, iv, c15); + Value in_elem = builder.create(loc, input, index); + Value vecIn_move_right = builder.create( + loc, iargs[0], iargs[0], + ArrayRef{0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14}); + Value vecIn_next = builder.create( + loc, in_elem, vecIn_move_right, c0); + Value vecOut_next = + builder.create(loc, vecB0, vecIn_next, iargs[1]); + Value out_elem = + builder.create(loc, vecOut_next, i15); + builder.create(loc, out_elem, output, iv); + + Value vecS1_lhs = + builder.create(loc, vecB1, vecIn_next, iargs[2]); + Value vecS1_rhs = + builder.create(loc, vecA1, vecOut_next); + Value vecS1_next = + builder.create(loc, vecS1_lhs, vecS1_rhs); + + Value vecS2_lhs = + builder.create(loc, vecB2, vecIn_next); + Value vecS2_rhs = + builder.create(loc, vecA2, vecOut_next); + Value vecS2_next = + builder.create(loc, vecS2_lhs, vecS2_rhs); + + builder.create( + loc, std::vector{vecOut_next, vecS1_next, vecS2_next}); + }); + + Value vecOut_tmp2 = processResult.getResult(0); + Value vecS1_tmp2 = processResult.getResult(1); + Value vecS2_tmp2 = processResult.getResult(2); + + // This loop represents the tail ending section. 
+ rewriter.create( + loc, upperBound, N, c1, ValueRange{vecOut_tmp2, vecS1_tmp2, vecS2_tmp2}, + [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { + Value vecIn_move_right = builder.create( + loc, iargs[0], iargs[0], + ArrayRef{0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14}); + Value vecIn_next = builder.create( + loc, f0, vecIn_move_right, c0); + Value vecOut_next = + builder.create(loc, vecB0, vecIn_next, iargs[1]); + Value out_elem = + builder.create(loc, vecOut_next, i15); + builder.create(loc, out_elem, output, iv); + + Value vecS1_lhs = + builder.create(loc, vecB1, vecIn_next, iargs[2]); + Value vecS1_rhs = + builder.create(loc, vecA1, vecOut_next); + Value vecS1_next = + builder.create(loc, vecS1_lhs, vecS1_rhs); + + Value vecS2_lhs = + builder.create(loc, vecB2, vecIn_next); + Value vecS2_rhs = + builder.create(loc, vecA2, vecOut_next); + Value vecS2_next = + builder.create(loc, vecS2_lhs, vecS2_rhs); + + builder.create( + loc, std::vector{vecOut_next, vecS1_next, vecS2_next}); + }); + + rewriter.eraseOp(op); + return success(); + } + +private: + int64_t stride; +}; + } // end anonymous namespace void populateLowerDAPConversionPatterns(RewritePatternSet &patterns, @@ -341,6 +554,7 @@ void populateLowerDAPConversionPatterns(RewritePatternSet &patterns, patterns.add(patterns.getContext()); patterns.add(patterns.getContext(), stride); patterns.add(patterns.getContext(), stride); + patterns.add(patterns.getContext(), stride); } //===----------------------------------------------------------------------===// @@ -363,7 +577,8 @@ class LowerDAPPass : public PassWrapper> { void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); + affine::AffineDialect, arith::ArithDialect, + linalg::LinalgDialect>(); } Option stride{*this, "DAP-vector-splitting", llvm::cl::desc("Vector splitting size."), @@ -376,10 +591,10 @@ void LowerDAPPass::runOnOperation() { ModuleOp module = getOperation(); ConversionTarget 
target(*context); - target.addLegalDialect(); + target + .addLegalDialect(); target.addLegalOp(); RewritePatternSet patterns(context);