diff --git a/codegen/compiler/src/Quidditch/Conversion/CMakeLists.txt b/codegen/compiler/src/Quidditch/Conversion/CMakeLists.txt index 491a442..f475b9d 100644 --- a/codegen/compiler/src/Quidditch/Conversion/CMakeLists.txt +++ b/codegen/compiler/src/Quidditch/Conversion/CMakeLists.txt @@ -38,3 +38,18 @@ iree_cc_library( MLIRSCFDialect MLIRTransforms ) + +iree_cc_library( + NAME + ConvertDMAToLLVM + SRCS + "ConvertDMAToLLVM.cpp" + DEPS + Quidditch::Dialect::DMA::IR::DMADialect + MLIRAnalysis + MLIRIR + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRSCFDialect + MLIRTransforms +) diff --git a/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.cpp b/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.cpp new file mode 100644 index 0000000..d29c81b --- /dev/null +++ b/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.cpp @@ -0,0 +1,484 @@ +#include "ConvertDMAToLLVM.h" + +#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "Quidditch/Dialect/DMA/IR/DMAOps.h" + +using namespace mlir; +using namespace quidditch::dma; + +/// Returns the number of potentially non-contiguous outer dimensions of +/// 'memRefType'. The remaining inner dimensions (i.e. all dimensions at index +/// 'NonContiguousOuterDims' to the MemRef rank) are known to be contiguous. +/// Returns failure if the layout attribute of the MemRef is unsupported. +static FailureOr getNumNonContiguousOuterDims(MemRefType memRefType) { + auto stridesAttr = + dyn_cast_or_null(memRefType.getLayout()); + if (!stridesAttr) { + if (memRefType.getLayout() && !memRefType.getLayout().isIdentity()) + return failure(); + + // No layout or identity layouts are by definition fully contiguous. + return 0; + } + + int64_t innerSize = 1; + ArrayRef shape = memRefType.getShape(); + ArrayRef strides = stridesAttr.getStrides(); + for (; !shape.empty(); + shape = shape.drop_back(), strides = strides.drop_back()) { + int64_t dim = shape.back(); + // Unit dims can be dropped alongside the corresponding stride of that dim. + if (dim == 1) + continue; + + int64_t stride = strides.back(); + if (ShapedType::isDynamic(stride)) + break; + + if (innerSize != stride) + break; + + // Note: Dim may be dynamic with the value -1. This intentionally will only + // fail the 'if' above later if the outer dims are non-zero. + innerSize *= dim; + } + + return shape.size(); +} + +/// Returns true if this MemRef type is known to have a fully contiguous layout. 
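+/// For illustration (example types assumed, not taken from the code): a
+/// 'memref<4x8xf32, strided<[8, 1]>>' is fully contiguous, whereas a
+/// 'memref<4x8xf32, strided<[16, 1]>>' has one non-contiguous outer dimension
+/// and is therefore not considered contiguous by this helper.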
+/// TODO: Could be upstreamed next to +/// 'memref::isStaticShapeAndContiguousRowMajor' +static bool isContiguous(MemRefType memRefType) { + return getNumNonContiguousOuterDims(memRefType) == 0; +} + +namespace { +struct StartTransferOp1DLowering : ConvertOpToLLVMPattern { + + LLVM::LLVMFuncOp dmaStart1DFunc; + + StartTransferOp1DLowering(LLVM::LLVMFuncOp dmaStart1DFunc, + const LLVMTypeConverter &converter) + : ConvertOpToLLVMPattern(converter, /*benefit=*/2), + dmaStart1DFunc(dmaStart1DFunc) {} + + LogicalResult match(StartTransferOp op) const override { + return success(isContiguous(op.getSource().getType()) && + isContiguous(op.getDest().getType())); + } + + void rewrite(StartTransferOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + MemRefDescriptor sourceDescriptor(adaptor.getSource()); + MemRefDescriptor destDescriptor(adaptor.getDest()); + + Value source = sourceDescriptor.bufferPtr( + rewriter, op->getLoc(), *getTypeConverter(), op.getSource().getType()); + Value dest = destDescriptor.bufferPtr( + rewriter, op->getLoc(), *getTypeConverter(), op.getDest().getType()); + + MemRefType sourceMemRef = op.getSource().getType(); + SmallVector dynamicSizes; + for (auto [index, dim] : llvm::enumerate(sourceMemRef.getShape())) + if (ShapedType::isDynamic(dim)) + dynamicSizes.push_back( + sourceDescriptor.size(rewriter, op->getLoc(), index)); + + SmallVector sizes; + SmallVector strides; + Value totalSize; + getMemRefDescriptorSizes( + op->getLoc(), + // Offsets are not considered an identity layout. + // Get rid of the layout entirely for the size calculation. + MemRefType::get(sourceMemRef.getShape(), sourceMemRef.getElementType(), + nullptr, sourceMemRef.getMemorySpace()), + dynamicSizes, rewriter, sizes, strides, totalSize); + + rewriter.replaceOpWithNewOp(op, dmaStart1DFunc, + ValueRange{ + dest, + source, + totalSize, + }); + } +}; + +struct StartTransferOp2DLowering : ConvertOpToLLVMPattern { + + LLVM::LLVMFuncOp dmaStart2DFunc; + + StartTransferOp2DLowering(LLVM::LLVMFuncOp dmaStart2DFunc, + const LLVMTypeConverter &converter) + : ConvertOpToLLVMPattern(converter), dmaStart2DFunc(dmaStart2DFunc) {} + + LogicalResult + matchAndRewrite(StartTransferOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + MemRefType sourceMemRef = op.getSource().getType(); + MemRefType destMemRef = op.getDest().getType(); + + // Compute the size of the contiguous inner loop common to both MemRefs and + // "shave" it off the ends of the shapes and strides. The remaining shapes + // and strides are considered our outer dimensions. + FailureOr sourceNonContiguous = + getNumNonContiguousOuterDims(sourceMemRef); + FailureOr destNonContiguous = + getNumNonContiguousOuterDims(destMemRef); + if (failed(sourceNonContiguous) || failed(destNonContiguous)) + return failure(); + size_t sharedNonContiguous = + std::max(*sourceNonContiguous, *destNonContiguous); + if (sharedNonContiguous == 0) + return failure(); + + Value elementSize = rewriter.create( + op->getLoc(), + rewriter.getI32IntegerAttr(llvm::divideCeil( + op.getSource().getType().getElementTypeBitWidth(), 8))); + SmallVector sizes = + memref::getMixedSizes(rewriter, op->getLoc(), op.getSource()); + + // Build a loop nest iterating over all outer dimensions - 1 and adjusts the + // source and destination pointers accordingly. The inner-most outer + // dimension is used in the DMA call for the repetition count and strides. 
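+    // Illustrative sketch (pseudo-IR, assuming a rank-3 source with two
+    // non-contiguous outer dimensions): the generated structure is roughly
+    //   for %i = 0 to dim(source, 0) {
+    //     %token = call @snrt_dma_start_2d(dest[%i, 0, 0], source[%i, 0, 0],
+    //                  /*size=*/dim(source, 2) * elementSize,
+    //                  /*dst_stride=*/stride(dest, 1) * elementSize,
+    //                  /*src_stride=*/stride(source, 1) * elementSize,
+    //                  /*repeat=*/dim(source, 1))
+    //   }
+    // with the token of the last iteration replacing the op's result.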
+ SmallVector lowerBounds; + SmallVector upperBounds; + SmallVector steps; + Value zeroIndex = rewriter.create(op.getLoc(), 0); + Value oneIndex = rewriter.create(op.getLoc(), 1); + for (size_t index : llvm::seq(sharedNonContiguous - 1)) { + lowerBounds.push_back(zeroIndex); + steps.push_back(oneIndex); + upperBounds.push_back(getValueOrCreateConstantIndexOp( + rewriter, op->getLoc(), sizes[index])); + } + + Value contiguousSize; + for (auto index : + llvm::seq(sharedNonContiguous, sourceMemRef.getRank())) { + Value dim = + getValueOrCreateConstantIndexOp(rewriter, op->getLoc(), sizes[index]); + if (!contiguousSize) { + contiguousSize = dim; + continue; + } + contiguousSize = + rewriter.create(op->getLoc(), contiguousSize, dim); + } + contiguousSize = typeConverter->materializeTargetConversion( + rewriter, op->getLoc(), getIndexType(), contiguousSize); + contiguousSize = + rewriter.create(op->getLoc(), contiguousSize, elementSize); + + Value completedToken = rewriter.create(op->getLoc()); + + scf::LoopNest loopNest = scf::buildLoopNest( + rewriter, op->getLoc(), lowerBounds, upperBounds, steps, completedToken, + [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + SmallVector offsets = ivs; + SmallVector subSizes(sharedNonContiguous - 1, + rewriter.getIndexAttr(1)); + for (unsigned i : llvm::seq(sharedNonContiguous - 1, + sourceMemRef.getRank())) { + offsets.push_back(rewriter.getIndexAttr(0)); + subSizes.push_back(sizes[i]); + } + SmallVector strides(sourceMemRef.getRank(), + rewriter.getIndexAttr(1)); + + TypedValue sourceMemRefSlice = + rewriter.create(loc, op.getSource(), offsets, + subSizes, strides); + TypedValue destMemRefSlice = + rewriter.create(loc, op.getDest(), offsets, + subSizes, strides); + + auto sourceDescriptor = + MemRefDescriptor(typeConverter->materializeTargetConversion( + rewriter, op->getLoc(), + typeConverter->convertType(sourceMemRefSlice.getType()), + sourceMemRefSlice)); + auto destDescriptor = + MemRefDescriptor(typeConverter->materializeTargetConversion( + rewriter, op->getLoc(), + typeConverter->convertType(destMemRefSlice.getType()), + destMemRefSlice)); + + Value sourceAdjusted = sourceDescriptor.bufferPtr( + rewriter, op->getLoc(), *getTypeConverter(), + sourceMemRefSlice.getType()); + Value destAdjusted = destDescriptor.bufferPtr( + rewriter, op->getLoc(), *getTypeConverter(), + destMemRefSlice.getType()); + + Value sourceStride = + sourceDescriptor.stride(builder, loc, sharedNonContiguous - 1); + sourceStride = rewriter.create( + op->getLoc(), sourceStride, elementSize); + Value destStride = + destDescriptor.stride(builder, loc, sharedNonContiguous - 1); + destStride = rewriter.create(op->getLoc(), destStride, + elementSize); + + Value outerLoopSize = + sourceDescriptor.size(builder, loc, sharedNonContiguous - 1); + return {builder + .create(loc, dmaStart2DFunc, + ValueRange{ + destAdjusted, + sourceAdjusted, + contiguousSize, + destStride, + sourceStride, + outerLoopSize, + }) + .getResult()}; + }); + + Type tokenType = typeConverter->convertType(op.getType()); + rewriter.replaceOp( + op, typeConverter->materializeTargetConversion( + rewriter, op->getLoc(), tokenType, loopNest.results.front())); + return success(); + } +}; + +// TODO: These should not be hardcoded. 
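+// Base address and size of the cluster's zero-memory region (memory that
+// reads back as zeros), used below as the source of DMA transfers when
+// zero-filling buffers. The values are assumed to match the Snitch cluster
+// memory map.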
+constexpr unsigned zeroMemSize = 0x10000; +constexpr unsigned zeroMemAddress = 0x10030000; + +struct StartContiguousZeroMemTransferOpOpLowering + : ConvertOpToLLVMPattern { + + LLVM::LLVMFuncOp dmaStart1DFunc; + LLVM::LLVMFuncOp dmaStart2DFunc; + + StartContiguousZeroMemTransferOpOpLowering(LLVM::LLVMFuncOp dmaStart1DFunc, + LLVM::LLVMFuncOp dmaStart2DFunc, + const LLVMTypeConverter &converter) + : ConvertOpToLLVMPattern(converter, /*benefit=*/2), + dmaStart1DFunc(dmaStart1DFunc), dmaStart2DFunc(dmaStart2DFunc) {} + + LogicalResult + matchAndRewrite(StartZeroMemTransferOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!isContiguous(op.getFilled().getType())) + return failure(); + + Value zeroPointer = rewriter.create( + op->getLoc(), rewriter.getType(), + rewriter.create( + op->getLoc(), rewriter.getI32IntegerAttr(zeroMemAddress))); + Value zeroMemSizeValue = rewriter.create( + op->getLoc(), rewriter.getI32IntegerAttr(zeroMemSize)); + + SmallVector sizes; + SmallVector strides; + Value size; + + auto filledDesc = MemRefDescriptor(adaptor.getFilled()); + + MemRefType memRefType = op.getFilled().getType(); + SmallVector dynamicSizes; + for (auto [index, shape] : llvm::enumerate(memRefType.getShape())) + if (ShapedType::isDynamic(shape)) + dynamicSizes.push_back(filledDesc.size(rewriter, op->getLoc(), index)); + + // Function does not support strided layout, even if it is contiguous. + // Lie about it and remove it. + // TODO: Consider fixing this upstream. + // TODO: Make a clone method of `MemRefType` that changes just the layout. + this->getMemRefDescriptorSizes( + op->getLoc(), + MemRefType::get(memRefType.getShape(), memRefType.getElementType()), + dynamicSizes, rewriter, sizes, strides, size); + + Value zero = + createIndexAttrConstant(rewriter, op->getLoc(), getIndexType(), 0); + Value bufferPointer = filledDesc.bufferPtr(rewriter, op->getLoc(), + *getTypeConverter(), memRefType); + Value times2D = + rewriter.create(op->getLoc(), size, zeroMemSizeValue); + // Note: This call would not be legal as a 'start_dma_transfer' call as + // MemRefs do not allow internal aliasing, which the below does via the + // stride of 0. 
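+    // Illustrative example (buffer size assumed): for a 0x25000-byte buffer
+    // and the 0x10000-byte zero-memory region, the 2D transfer below repeats
+    // the zero region twice with a source stride of 0 (zeroing 0x20000
+    // bytes), and the trailing 1D transfer zeroes the remaining 0x5000 bytes.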
+ rewriter.create(op->getLoc(), dmaStart2DFunc, + ValueRange{bufferPointer, zeroPointer, + zeroMemSizeValue, zeroMemSizeValue, + zero, times2D}); + Value offset = + rewriter.create(op->getLoc(), times2D, zeroMemSizeValue); + bufferPointer = rewriter.create( + op->getLoc(), bufferPointer.getType(), rewriter.getI8Type(), + bufferPointer, offset); + Value rest = + rewriter.create(op->getLoc(), size, zeroMemSizeValue); + rewriter.replaceOpWithNewOp( + op, dmaStart1DFunc, ValueRange{bufferPointer, zeroPointer, rest}); + return success(); + } +}; + +struct StartZeroMemTransferOpOpLowering + : ConvertOpToLLVMPattern { + + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(StartZeroMemTransferOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + MemRefType memRefType = op.getFilled().getType(); + + FailureOr nonContiguousDims = + getNumNonContiguousOuterDims(memRefType); + if (failed(nonContiguousDims) || nonContiguousDims == 0) + return failure(); + + SmallVector sizes = + memref::getMixedSizes(rewriter, op->getLoc(), op.getFilled()); + + SmallVector lowerBounds; + SmallVector upperBounds; + SmallVector steps; + Value zeroIndex = rewriter.create(op.getLoc(), 0); + Value oneIndex = rewriter.create(op.getLoc(), 1); + for (size_t index : llvm::seq(*nonContiguousDims)) { + lowerBounds.push_back(zeroIndex); + steps.push_back(oneIndex); + upperBounds.push_back(getValueOrCreateConstantIndexOp( + rewriter, op->getLoc(), sizes[index])); + } + + // Loop over every non-contiguous dimension to zero every contiguous + // inner subview. + Value completedToken = rewriter.create(op->getLoc()); + scf::LoopNest loopNest = scf::buildLoopNest( + rewriter, op->getLoc(), lowerBounds, upperBounds, steps, completedToken, + [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange iterArgs) -> scf::ValueVector { + SmallVector offsets = ivs; + SmallVector subSizes(*nonContiguousDims, + rewriter.getIndexAttr(1)); + for (unsigned i : + llvm::seq(*nonContiguousDims, memRefType.getRank())) { + offsets.push_back(rewriter.getIndexAttr(0)); + subSizes.push_back(sizes[i]); + } + SmallVector strides(memRefType.getRank(), + rewriter.getIndexAttr(1)); + + Value subMemRef = rewriter.create( + loc, op.getFilled(), offsets, subSizes, strides); + return { + builder.create(op->getLoc(), subMemRef)}; + }); + + Type tokenType = typeConverter->convertType(op.getType()); + rewriter.replaceOp( + op, typeConverter->materializeTargetConversion( + rewriter, op->getLoc(), tokenType, loopNest.results.front())); + return success(); + } +}; + +struct WaitForTransfersOpLowering : ConvertOpToLLVMPattern { + + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(WaitForTransfersOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (adaptor.getTokens().empty()) { + rewriter.eraseOp(op); + return success(); + } + + Value current = adaptor.getTokens().front(); + for (Value iter : llvm::drop_begin(adaptor.getTokens())) + current = rewriter.create(op->getLoc(), current, iter); + + Block *prev = op->getBlock(); + Block *body = rewriter.splitBlock(prev, op->getIterator()); + Block *after = rewriter.splitBlock(body, op->getNextNode()->getIterator()); + rewriter.setInsertionPointToEnd(prev); + rewriter.create(op->getLoc(), body); + + rewriter.setInsertionPointToEnd(body); + Value lastCompleted = + rewriter + .create( + op->getLoc(), /*res=*/rewriter.getI32Type(), + /*operands=*/ValueRange(), + // dmstati $0, 0 + // 
opcode6=0x2b, func3=0, func7=0b100, rd=$0, rs1=zero, + // rs2=imm5(0) + ".insn r 0x2b, 0, 0b100, $0, zero, zero\n", + /*constraints=*/"=r", + /*has_side_effects=*/true, /*is_align_stack=*/false, + /*asm_dialect=*/nullptr, /*operand_attrs=*/nullptr) + .getRes(); + Value notDone = rewriter.create( + op->getLoc(), LLVM::ICmpPredicate::ult, lastCompleted, current); + rewriter.create(op->getLoc(), notDone, body, after); + + rewriter.setInsertionPointToStart(after); + rewriter.eraseOp(op); + return success(); + } +}; + +struct CompletedTokenOpLowering : ConvertOpToLLVMPattern { + + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(CompletedTokenOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, typeConverter->convertType(op.getType()), 0); + return success(); + } +}; + +} // namespace + +void quidditch::populateDMAToLLVMConversionPatterns( + mlir::ModuleOp moduleOp, LLVMTypeConverter &typeConverter, + RewritePatternSet &patterns) { + + typeConverter.addConversion( + [](TokenType token) { return IntegerType::get(token.getContext(), 32); }); + + auto builder = OpBuilder::atBlockEnd(moduleOp.getBody()); + auto ptrType = builder.getType(); + IntegerType i32 = builder.getI32Type(); + IntegerType sizeT = i32; + auto dmaStart1D = builder.create( + builder.getUnknownLoc(), "snrt_dma_start_1d", + LLVM::LLVMFunctionType::get(i32, + ArrayRef{ptrType, ptrType, sizeT})); + dmaStart1D->setAttr("hal.import.bitcode", builder.getUnitAttr()); + + auto dmaStart2D = builder.create( + builder.getUnknownLoc(), "snrt_dma_start_2d", + LLVM::LLVMFunctionType::get( + i32, ArrayRef{ptrType, ptrType, sizeT, sizeT, sizeT, sizeT})); + dmaStart2D->setAttr("hal.import.bitcode", builder.getUnitAttr()); + + patterns.insert(typeConverter); + patterns.insert(dmaStart1D, typeConverter); + patterns.insert(dmaStart2D, typeConverter); + patterns.insert( + dmaStart1D, dmaStart2D, typeConverter); +} diff --git a/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.h b/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.h new file mode 100644 index 0000000..cbd351e --- /dev/null +++ b/codegen/compiler/src/Quidditch/Conversion/ConvertDMAToLLVM.h @@ -0,0 +1,10 @@ + +#pragma once + +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" + +namespace quidditch { +void populateDMAToLLVMConversionPatterns(mlir::ModuleOp moduleOp, + mlir::LLVMTypeConverter &converter, + mlir::RewritePatternSet &patterns); +} diff --git a/codegen/compiler/src/Quidditch/Conversion/ConvertSnitchToLLVM.cpp b/codegen/compiler/src/Quidditch/Conversion/ConvertSnitchToLLVM.cpp index 4f404b7..063bae6 100644 --- a/codegen/compiler/src/Quidditch/Conversion/ConvertSnitchToLLVM.cpp +++ b/codegen/compiler/src/Quidditch/Conversion/ConvertSnitchToLLVM.cpp @@ -44,447 +44,6 @@ struct L1MemoryViewOpLowering : ConvertOpToLLVMPattern { return success(); } }; -} // namespace - -/// Returns the number of potentially non-contiguous outer dimensions of -/// 'memRefType'. The remaining inner dimensions (i.e. all dimensions at index -/// 'NonContiguousOuterDims' to the MemRef rank) are known to be contiguous. -/// Returns failure if the layout attribute of the MemRef is unsupported. 
-static FailureOr getNumNonContiguousOuterDims(MemRefType memRefType) { - auto stridesAttr = - dyn_cast_or_null(memRefType.getLayout()); - if (!stridesAttr) { - if (memRefType.getLayout() && !memRefType.getLayout().isIdentity()) - return failure(); - - // No layout or identity layouts are by definition fully contiguous. - return 0; - } - - int64_t innerSize = 1; - ArrayRef shape = memRefType.getShape(); - ArrayRef strides = stridesAttr.getStrides(); - for (; !shape.empty(); - shape = shape.drop_back(), strides = strides.drop_back()) { - int64_t dim = shape.back(); - // Unit dims can be dropped alongside the corresponding stride of that dim. - if (dim == 1) - continue; - - int64_t stride = strides.back(); - if (ShapedType::isDynamic(stride)) - break; - - if (innerSize != stride) - break; - - // Note: Dim may be dynamic with the value -1. This intentionally will only - // fail the 'if' above later if the outer dims are non-zero. - innerSize *= dim; - } - - return shape.size(); -} - -/// Returns true if this MemRef type is known to have a fully contiguous layout. -/// TODO: Could be upstreamed next to -/// 'memref::isStaticShapeAndContiguousRowMajor' -static bool isContiguous(MemRefType memRefType) { - return getNumNonContiguousOuterDims(memRefType) == 0; -} - -namespace { -struct StartDMATransferOp1DLowering - : ConvertOpToLLVMPattern { - - LLVM::LLVMFuncOp dmaStart1DFunc; - - StartDMATransferOp1DLowering(LLVM::LLVMFuncOp dmaStart1DFunc, - const LLVMTypeConverter &converter) - : ConvertOpToLLVMPattern(converter, /*benefit=*/2), - dmaStart1DFunc(dmaStart1DFunc) {} - - LogicalResult match(StartDMATransferOp op) const override { - return success(isContiguous(op.getSource().getType()) && - isContiguous(op.getDest().getType())); - } - - void rewrite(StartDMATransferOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MemRefDescriptor sourceDescriptor(adaptor.getSource()); - MemRefDescriptor destDescriptor(adaptor.getDest()); - - Value source = sourceDescriptor.bufferPtr( - rewriter, op->getLoc(), *getTypeConverter(), op.getSource().getType()); - Value dest = destDescriptor.bufferPtr( - rewriter, op->getLoc(), *getTypeConverter(), op.getDest().getType()); - - MemRefType sourceMemRef = op.getSource().getType(); - SmallVector dynamicSizes; - for (auto [index, dim] : llvm::enumerate(sourceMemRef.getShape())) - if (ShapedType::isDynamic(dim)) - dynamicSizes.push_back( - sourceDescriptor.size(rewriter, op->getLoc(), index)); - - SmallVector sizes; - SmallVector strides; - Value totalSize; - getMemRefDescriptorSizes( - op->getLoc(), - // Offsets are not considered an identity layout. - // Get rid of the layout entirely for the size calculation. 
- MemRefType::get(sourceMemRef.getShape(), sourceMemRef.getElementType(), - nullptr, sourceMemRef.getMemorySpace()), - dynamicSizes, rewriter, sizes, strides, totalSize); - - rewriter.replaceOpWithNewOp(op, dmaStart1DFunc, - ValueRange{ - dest, - source, - totalSize, - }); - } -}; - -struct StartDMATransferOp2DLowering - : ConvertOpToLLVMPattern { - - LLVM::LLVMFuncOp dmaStart2DFunc; - - StartDMATransferOp2DLowering(LLVM::LLVMFuncOp dmaStart2DFunc, - const LLVMTypeConverter &converter) - : ConvertOpToLLVMPattern(converter), dmaStart2DFunc(dmaStart2DFunc) {} - - LogicalResult - matchAndRewrite(StartDMATransferOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MemRefType sourceMemRef = op.getSource().getType(); - MemRefType destMemRef = op.getDest().getType(); - - // Compute the size of the contiguous inner loop common to both MemRefs and - // "shave" it off the ends of the shapes and strides. The remaining shapes - // and strides are considered our outer dimensions. - FailureOr sourceNonContiguous = - getNumNonContiguousOuterDims(sourceMemRef); - FailureOr destNonContiguous = - getNumNonContiguousOuterDims(destMemRef); - if (failed(sourceNonContiguous) || failed(destNonContiguous)) - return failure(); - size_t sharedNonContiguous = - std::max(*sourceNonContiguous, *destNonContiguous); - if (sharedNonContiguous == 0) - return failure(); - - Value elementSize = rewriter.create( - op->getLoc(), - rewriter.getI32IntegerAttr(llvm::divideCeil( - op.getSource().getType().getElementTypeBitWidth(), 8))); - SmallVector sizes = - memref::getMixedSizes(rewriter, op->getLoc(), op.getSource()); - - // Build a loop nest iterating over all outer dimensions - 1 and adjusts the - // source and destination pointers accordingly. The inner-most outer - // dimension is used in the DMA call for the repetition count and strides. 
- SmallVector lowerBounds; - SmallVector upperBounds; - SmallVector steps; - Value zeroIndex = rewriter.create(op.getLoc(), 0); - Value oneIndex = rewriter.create(op.getLoc(), 1); - for (size_t index : llvm::seq(sharedNonContiguous - 1)) { - lowerBounds.push_back(zeroIndex); - steps.push_back(oneIndex); - upperBounds.push_back(getValueOrCreateConstantIndexOp( - rewriter, op->getLoc(), sizes[index])); - } - - Value contiguousSize; - for (auto index : - llvm::seq(sharedNonContiguous, sourceMemRef.getRank())) { - Value dim = - getValueOrCreateConstantIndexOp(rewriter, op->getLoc(), sizes[index]); - if (!contiguousSize) { - contiguousSize = dim; - continue; - } - contiguousSize = - rewriter.create(op->getLoc(), contiguousSize, dim); - } - contiguousSize = typeConverter->materializeTargetConversion( - rewriter, op->getLoc(), getIndexType(), contiguousSize); - contiguousSize = - rewriter.create(op->getLoc(), contiguousSize, elementSize); - - Value completedToken = rewriter.create(op->getLoc()); - - scf::LoopNest loopNest = scf::buildLoopNest( - rewriter, op->getLoc(), lowerBounds, upperBounds, steps, completedToken, - [&](OpBuilder &builder, Location loc, ValueRange ivs, - ValueRange iterArgs) -> scf::ValueVector { - SmallVector offsets = ivs; - SmallVector subSizes(sharedNonContiguous - 1, - rewriter.getIndexAttr(1)); - for (unsigned i : llvm::seq(sharedNonContiguous - 1, - sourceMemRef.getRank())) { - offsets.push_back(rewriter.getIndexAttr(0)); - subSizes.push_back(sizes[i]); - } - SmallVector strides(sourceMemRef.getRank(), - rewriter.getIndexAttr(1)); - - TypedValue sourceMemRefSlice = - rewriter.create(loc, op.getSource(), offsets, - subSizes, strides); - TypedValue destMemRefSlice = - rewriter.create(loc, op.getDest(), offsets, - subSizes, strides); - - auto sourceDescriptor = - MemRefDescriptor(typeConverter->materializeTargetConversion( - rewriter, op->getLoc(), - typeConverter->convertType(sourceMemRefSlice.getType()), - sourceMemRefSlice)); - auto destDescriptor = - MemRefDescriptor(typeConverter->materializeTargetConversion( - rewriter, op->getLoc(), - typeConverter->convertType(destMemRefSlice.getType()), - destMemRefSlice)); - - Value sourceAdjusted = sourceDescriptor.bufferPtr( - rewriter, op->getLoc(), *getTypeConverter(), - sourceMemRefSlice.getType()); - Value destAdjusted = destDescriptor.bufferPtr( - rewriter, op->getLoc(), *getTypeConverter(), - destMemRefSlice.getType()); - - Value sourceStride = - sourceDescriptor.stride(builder, loc, sharedNonContiguous - 1); - sourceStride = rewriter.create( - op->getLoc(), sourceStride, elementSize); - Value destStride = - destDescriptor.stride(builder, loc, sharedNonContiguous - 1); - destStride = rewriter.create(op->getLoc(), destStride, - elementSize); - - Value outerLoopSize = - sourceDescriptor.size(builder, loc, sharedNonContiguous - 1); - return {builder - .create(loc, dmaStart2DFunc, - ValueRange{ - destAdjusted, - sourceAdjusted, - contiguousSize, - destStride, - sourceStride, - outerLoopSize, - }) - .getResult()}; - }); - - Type tokenType = typeConverter->convertType(op.getType()); - rewriter.replaceOp( - op, typeConverter->materializeTargetConversion( - rewriter, op->getLoc(), tokenType, loopNest.results.front())); - return success(); - } -}; - -// TODO: These should not be hardcoded. 
-constexpr unsigned zeroMemSize = 0x10000; -constexpr unsigned zeroMemAddress = 0x10030000; - -struct StartContiguousZeroMemTransferOpOpLowering - : ConvertOpToLLVMPattern { - - LLVM::LLVMFuncOp dmaStart1DFunc; - LLVM::LLVMFuncOp dmaStart2DFunc; - - StartContiguousZeroMemTransferOpOpLowering(LLVM::LLVMFuncOp dmaStart1DFunc, - LLVM::LLVMFuncOp dmaStart2DFunc, - const LLVMTypeConverter &converter) - : ConvertOpToLLVMPattern(converter, /*benefit=*/2), - dmaStart1DFunc(dmaStart1DFunc), dmaStart2DFunc(dmaStart2DFunc) {} - - LogicalResult - matchAndRewrite(StartZeroMemTransferOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!isContiguous(op.getFilled().getType())) - return failure(); - - Value zeroPointer = rewriter.create( - op->getLoc(), rewriter.getType(), - rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(zeroMemAddress))); - Value zeroMemSizeValue = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(zeroMemSize)); - - SmallVector sizes; - SmallVector strides; - Value size; - - auto filledDesc = MemRefDescriptor(adaptor.getFilled()); - - MemRefType memRefType = op.getFilled().getType(); - SmallVector dynamicSizes; - for (auto [index, shape] : llvm::enumerate(memRefType.getShape())) - if (ShapedType::isDynamic(shape)) - dynamicSizes.push_back(filledDesc.size(rewriter, op->getLoc(), index)); - - // Function does not support strided layout, even if it is contiguous. - // Lie about it and remove it. - // TODO: Consider fixing this upstream. - // TODO: Make a clone method of `MemRefType` that changes just the layout. - this->getMemRefDescriptorSizes( - op->getLoc(), - MemRefType::get(memRefType.getShape(), memRefType.getElementType()), - dynamicSizes, rewriter, sizes, strides, size); - - Value zero = - createIndexAttrConstant(rewriter, op->getLoc(), getIndexType(), 0); - Value bufferPointer = filledDesc.bufferPtr(rewriter, op->getLoc(), - *getTypeConverter(), memRefType); - Value times2D = - rewriter.create(op->getLoc(), size, zeroMemSizeValue); - // Note: This call would not be legal as a 'start_dma_transfer' call as - // MemRefs do not allow internal aliasing, which the below does via the - // stride of 0. 
- rewriter.create(op->getLoc(), dmaStart2DFunc, - ValueRange{bufferPointer, zeroPointer, - zeroMemSizeValue, zeroMemSizeValue, - zero, times2D}); - Value offset = - rewriter.create(op->getLoc(), times2D, zeroMemSizeValue); - bufferPointer = rewriter.create( - op->getLoc(), bufferPointer.getType(), rewriter.getI8Type(), - bufferPointer, offset); - Value rest = - rewriter.create(op->getLoc(), size, zeroMemSizeValue); - rewriter.replaceOpWithNewOp( - op, dmaStart1DFunc, ValueRange{bufferPointer, zeroPointer, rest}); - return success(); - } -}; - -struct StartZeroMemTransferOpOpLowering - : ConvertOpToLLVMPattern { - - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(StartZeroMemTransferOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MemRefType memRefType = op.getFilled().getType(); - - FailureOr nonContiguousDims = - getNumNonContiguousOuterDims(memRefType); - if (failed(nonContiguousDims) || nonContiguousDims == 0) - return failure(); - - SmallVector sizes = - memref::getMixedSizes(rewriter, op->getLoc(), op.getFilled()); - - SmallVector lowerBounds; - SmallVector upperBounds; - SmallVector steps; - Value zeroIndex = rewriter.create(op.getLoc(), 0); - Value oneIndex = rewriter.create(op.getLoc(), 1); - for (size_t index : llvm::seq(*nonContiguousDims)) { - lowerBounds.push_back(zeroIndex); - steps.push_back(oneIndex); - upperBounds.push_back(getValueOrCreateConstantIndexOp( - rewriter, op->getLoc(), sizes[index])); - } - - // Loop over every non-contiguous dimension to zero every contiguous - // inner subview. - Value completedToken = rewriter.create(op->getLoc()); - scf::LoopNest loopNest = scf::buildLoopNest( - rewriter, op->getLoc(), lowerBounds, upperBounds, steps, completedToken, - [&](OpBuilder &builder, Location loc, ValueRange ivs, - ValueRange iterArgs) -> scf::ValueVector { - SmallVector offsets = ivs; - SmallVector subSizes(*nonContiguousDims, - rewriter.getIndexAttr(1)); - for (unsigned i : - llvm::seq(*nonContiguousDims, memRefType.getRank())) { - offsets.push_back(rewriter.getIndexAttr(0)); - subSizes.push_back(sizes[i]); - } - SmallVector strides(memRefType.getRank(), - rewriter.getIndexAttr(1)); - - Value subMemRef = rewriter.create( - loc, op.getFilled(), offsets, subSizes, strides); - return { - builder.create(op->getLoc(), subMemRef)}; - }); - - Type tokenType = typeConverter->convertType(op.getType()); - rewriter.replaceOp( - op, typeConverter->materializeTargetConversion( - rewriter, op->getLoc(), tokenType, loopNest.results.front())); - return success(); - } -}; - -struct WaitForDMATransfersOpLowering - : ConvertOpToLLVMPattern { - - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(WaitForDMATransfersOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (adaptor.getTokens().empty()) { - rewriter.eraseOp(op); - return success(); - } - - Value current = adaptor.getTokens().front(); - for (Value iter : llvm::drop_begin(adaptor.getTokens())) - current = rewriter.create(op->getLoc(), current, iter); - - Block *prev = op->getBlock(); - Block *body = rewriter.splitBlock(prev, op->getIterator()); - Block *after = rewriter.splitBlock(body, op->getNextNode()->getIterator()); - rewriter.setInsertionPointToEnd(prev); - rewriter.create(op->getLoc(), body); - - rewriter.setInsertionPointToEnd(body); - Value lastCompleted = - rewriter - .create( - op->getLoc(), /*res=*/rewriter.getI32Type(), - /*operands=*/ValueRange(), - // dmstati $0, 
0 - // opcode6=0x2b, func3=0, func7=0b100, rd=$0, rs1=zero, - // rs2=imm5(0) - ".insn r 0x2b, 0, 0b100, $0, zero, zero\n", - /*constraints=*/"=r", - /*has_side_effects=*/true, /*is_align_stack=*/false, - /*asm_dialect=*/nullptr, /*operand_attrs=*/nullptr) - .getRes(); - Value notDone = rewriter.create( - op->getLoc(), LLVM::ICmpPredicate::ult, lastCompleted, current); - rewriter.create(op->getLoc(), notDone, body, after); - - rewriter.setInsertionPointToStart(after); - rewriter.eraseOp(op); - return success(); - } -}; - -struct CompletedTokenOpLowering : ConvertOpToLLVMPattern { - - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(CompletedTokenOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp( - op, typeConverter->convertType(op.getType()), 0); - return success(); - } -}; struct BarrierOpLowering : ConvertOpToLLVMPattern { @@ -623,40 +182,15 @@ void quidditch::populateSnitchToLLVMConversionPatterns( mlir::ModuleOp moduleOp, LLVMTypeConverter &typeConverter, RewritePatternSet &patterns) { - typeConverter.addConversion([](DMATokenType token) { - return IntegerType::get(token.getContext(), 32); - }); - auto builder = OpBuilder::atBlockEnd(moduleOp.getBody()); - auto ptrType = builder.getType(); IntegerType i32 = builder.getI32Type(); - IntegerType sizeT = i32; - auto dmaStart1D = builder.create( - builder.getUnknownLoc(), "snrt_dma_start_1d", - LLVM::LLVMFunctionType::get(i32, - ArrayRef{ptrType, ptrType, sizeT})); - dmaStart1D->setAttr("hal.import.bitcode", builder.getUnitAttr()); - - auto dmaStart2D = builder.create( - builder.getUnknownLoc(), "snrt_dma_start_2d", - LLVM::LLVMFunctionType::get( - i32, ArrayRef{ptrType, ptrType, sizeT, sizeT, sizeT, sizeT})); - dmaStart2D->setAttr("hal.import.bitcode", builder.getUnitAttr()); - auto computeCoreIndex = builder.create( builder.getUnknownLoc(), "snrt_cluster_core_idx", LLVM::LLVMFunctionType::get(i32, ArrayRef{})); computeCoreIndex->setAttr("hal.import.bitcode", builder.getUnitAttr()); - patterns - .insert( - typeConverter); - patterns.insert(dmaStart1D, typeConverter); - patterns.insert(dmaStart2D, typeConverter); - patterns.insert( - dmaStart1D, dmaStart2D, typeConverter); + patterns.insert(typeConverter); patterns.insert(computeCoreIndex, typeConverter); patterns.insert(SymbolTable(moduleOp), typeConverter); diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/CMakeLists.txt b/codegen/compiler/src/Quidditch/Dialect/DMA/CMakeLists.txt new file mode 100644 index 0000000..0e9c88b --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/CMakeLists.txt @@ -0,0 +1 @@ +iree_add_all_subdirs() diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/CMakeLists.txt b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/CMakeLists.txt new file mode 100644 index 0000000..2333a5b --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/CMakeLists.txt @@ -0,0 +1,12 @@ + +iree_cc_library( + NAME + DMACoreSpecializationOpInterfaceImpl + HDRS + "DMACoreSpecializationOpInterfaceImpl.h" + SRCS + "DMACoreSpecializationOpInterfaceImpl.cpp" + DEPS + Quidditch::Dialect::Snitch::IR::QuidditchSnitchDialect + Quidditch::Dialect::DMA::IR::DMADialect +) diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.cpp new file mode 100644 index 0000000..1bb4cb0 --- /dev/null +++ 
b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.cpp @@ -0,0 +1,76 @@ +#include "DMACoreSpecializationOpInterfaceImpl.h" + +#include "Quidditch/Dialect/DMA/IR/DMADialect.h" +#include "Quidditch/Dialect/DMA/IR/DMAOps.h" +#include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchInterfaces.h" +#include "mlir/IR/DialectRegistry.h" + +using namespace mlir; +using namespace quidditch::dma; +using namespace quidditch::Snitch; + +namespace { + +//===----------------------------------------------------------------------===// +// StartTransferOp::DMACoreSpecializationOpInterface +//===----------------------------------------------------------------------===// + +struct StartTransferOpImpl + : CoreSpecializationOpInterface::ExternalModel { + void replaceWithNoop(Operation *op, RewriterBase &rewriter) const { + rewriter.replaceOpWithNewOp(op); + } +}; + +struct StartTransferOpDMAImpl + : DMACoreSpecializationOpInterface::ExternalModel {}; + +//===----------------------------------------------------------------------===// +// StartZeroMemTransferOp::DMACoreSpecializationOpInterface +//===----------------------------------------------------------------------===// + +struct StartZeroMemTransferOpImpl + : CoreSpecializationOpInterface::ExternalModel { + void replaceWithNoop(Operation *op, RewriterBase &rewriter) const { + rewriter.replaceOpWithNewOp(op); + } + + // bool needsSynchronization(Operation *op) const { return true; } +}; + +struct StartZeroMemTransferOpDMAImpl + : DMACoreSpecializationOpInterface::ExternalModel< + StartZeroMemTransferOpDMAImpl, StartZeroMemTransferOp> {}; + +//===----------------------------------------------------------------------===// +// WaitForTransfersOpImpl::DMACoreSpecializationOpInterface +//===----------------------------------------------------------------------===// + +struct WaitForTransfersOpImpl + : CoreSpecializationOpInterface::ExternalModel { + void replaceWithNoop(Operation *op, RewriterBase &rewriter) const { + rewriter.eraseOp(op); + } + + bool needsSynchronization(Operation *op) const { return true; } +}; + +struct WaitForTransfersOpDMAImpl + : DMACoreSpecializationOpInterface::ExternalModel {}; + +} // namespace + +void quidditch::dma::registerDMACoreSpecializationOpInterface( + mlir::DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *context, DMADialect *dialect) { +#define REGISTER_IMPLS(Op) Op::attachInterface(*context) + REGISTER_IMPLS(StartTransferOp); + REGISTER_IMPLS(StartZeroMemTransferOp); + REGISTER_IMPLS(WaitForTransfersOp); + }); +} diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.h b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.h new file mode 100644 index 0000000..28f67f7 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.h @@ -0,0 +1,10 @@ + +#pragma once + +namespace mlir { +class DialectRegistry; +} + +namespace quidditch::dma { +void registerDMACoreSpecializationOpInterface(mlir::DialectRegistry ®istry); +} diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/CMakeLists.txt b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/CMakeLists.txt new file mode 100644 index 0000000..fee69c0 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/CMakeLists.txt @@ -0,0 +1,73 @@ +iree_add_all_subdirs() + +iree_cc_library( + NAME + DMADialect + HDRS + "DMADialect.h" + "DMAOps.h" + TEXTUAL_HDRS + "DMAAttrs.cpp.inc" + 
"DMAAttrs.h.inc" + "DMADialect.cpp.inc" + "DMADialect.h.inc" + "DMAOps.cpp.inc" + "DMAOps.h.inc" + "DMATypes.cpp.inc" + "DMATypes.h.inc" + SRCS + "DMAAttrs.cpp" + "DMADialect.cpp" + "DMAOps.cpp" + "DMATypes.cpp" + DEPS + ::DMAAttrsGen + ::DMADialectGen + ::DMAOpsGen + ::DMATypesGen + LLVMSupport + MLIRIR + MLIRInferTypeOpInterface + MLIRSupport + PUBLIC +) + +iree_tablegen_library( + NAME + DMAOpsGen + TD_FILE + "DMAOps.td" + OUTS + --gen-op-decls DMAOps.h.inc + --gen-op-defs DMAOps.cpp.inc +) + +iree_tablegen_library( + NAME + DMADialectGen + TD_FILE + "DMADialect.td" + OUTS + --gen-dialect-decls DMADialect.h.inc + --gen-dialect-defs DMADialect.cpp.inc +) + +iree_tablegen_library( + NAME + DMAAttrsGen + TD_FILE + "DMAAttrs.td" + OUTS + --gen-attrdef-decls DMAAttrs.h.inc + --gen-attrdef-defs DMAAttrs.cpp.inc +) + +iree_tablegen_library( + NAME + DMATypesGen + TD_FILE + "DMATypes.td" + OUTS + --gen-typedef-decls DMATypes.h.inc + --gen-typedef-defs DMATypes.cpp.inc +) diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.cpp new file mode 100644 index 0000000..ced2b88 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.cpp @@ -0,0 +1 @@ +#include "DMAAttrs.h" diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.h b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.h new file mode 100644 index 0000000..1b8b88e --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "mlir/IR/Attributes.h" + +#define GET_ATTRDEF_CLASSES +#include "Quidditch/Dialect/DMA/IR/DMAAttrs.h.inc" diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.td new file mode 100644 index 0000000..1fb3895 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAAttrs.td @@ -0,0 +1,20 @@ +#ifndef QUIDDITCH_DIALECT_DMA_DMAATTRS +#define QUIDDITCH_DIALECT_DMA_DMAATTRS + +include "Quidditch/Dialect/DMA/IR/DMADialect.td" +include "mlir/IR/AttrTypeBase.td" + +class DMA_Attr traits = []> : + AttrDef; + +def DMA_CompletedTokenAttr : DMA_Attr<"CompletedToken"> { + + let mnemonic = "completed_token"; + + let description = [{ + Attribute representing an instance of a `!dma.token` + signaling a complete transfer. 
+ }]; +} + +#endif diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.cpp new file mode 100644 index 0000000..058947f --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.cpp @@ -0,0 +1,44 @@ +#include "DMADialect.h" + +#include "DMAAttrs.h" +#include "DMAOps.h" +#include "DMATypes.h" +#include "llvm/ADT/TypeSwitch.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" + +#define GET_ATTRDEF_CLASSES +#include "Quidditch/Dialect/DMA/IR/DMAAttrs.cpp.inc" + +#include "Quidditch/Dialect/DMA/IR/DMADialect.cpp.inc" + +using namespace mlir; +using namespace quidditch::dma; + +//===----------------------------------------------------------------------===// +// DMADialect +//===----------------------------------------------------------------------===// + +void DMADialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "Quidditch/Dialect/DMA/IR/DMAOps.cpp.inc" + >(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "Quidditch/Dialect/DMA/IR/DMAAttrs.cpp.inc" + >(); + addTypes< +#define GET_TYPEDEF_LIST +#include "Quidditch/Dialect/DMA/IR/DMATypes.cpp.inc" + >(); +} + +Operation *DMADialect::materializeConstant(OpBuilder &builder, Attribute value, + Type type, Location loc) { + if (isa(value)) + return builder.create(loc); + + return nullptr; +} diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.h b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.h new file mode 100644 index 0000000..9f4a0e2 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Operation.h" + +#include "Quidditch/Dialect/DMA/IR/DMADialect.h.inc" diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.td new file mode 100644 index 0000000..69afe78 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMADialect.td @@ -0,0 +1,15 @@ +#ifndef QUIDDITCH_DIALECT_SNITCH_DMADIALECT +#define QUIDDITCH_DIALECT_SNITCH_DMADIALECT + +include "mlir/IR/DialectBase.td" + +def DMA_Dialect : Dialect { + let name = "dma"; + let cppNamespace = "::quidditch::dma"; + + let useDefaultAttributePrinterParser = 1; + let useDefaultTypePrinterParser = 1; + let hasConstantMaterializer = 1; +} + +#endif diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp new file mode 100644 index 0000000..1208ad0 --- /dev/null +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp @@ -0,0 +1,434 @@ +#include "DMAOps.h" + +#include "llvm/ADT/ScopeExit.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" + +#include "DMAAttrs.h" + +static mlir::ParseResult +parseTensorCopyTypes(mlir::OpAsmParser &parser, + mlir::DenseI64ArrayAttr staticHighPad, + mlir::Type ©Type, mlir::Type &resultType); + +static void printTensorCopyTypes(mlir::OpAsmPrinter &printer, mlir::Operation *, + mlir::DenseI64ArrayAttr staticHighPad, + mlir::Type copyType, mlir::Type resultType); + +#define GET_OP_CLASSES +#include "Quidditch/Dialect/DMA/IR/DMAOps.cpp.inc" + +using namespace mlir; +using namespace 
mlir::bufferization; +using namespace quidditch::dma; + +//===----------------------------------------------------------------------===// +// StartTensorCopyOp +//===----------------------------------------------------------------------===// + +ParseResult parseTensorCopyTypes(OpAsmParser &parser, + DenseI64ArrayAttr staticHighPad, + Type ©Type, Type &resultType) { + if (staticHighPad && !staticHighPad.empty()) { + if (parser.parseColon() || parser.parseType(copyType)) + return failure(); + } + if (parser.parseArrow() || parser.parseType(resultType)) + return failure(); + if (!staticHighPad || staticHighPad.empty()) + copyType = resultType; + return success(); +} + +static void printTensorCopyTypes(OpAsmPrinter &printer, mlir::Operation *, + DenseI64ArrayAttr staticHighPad, Type copyType, + Type resultType) { + if (staticHighPad && !staticHighPad.empty()) + printer << ": " << copyType; + printer << " -> " << resultType; +} + +LogicalResult StartTensorCopyOp::verify() { + if (getStaticHighPadAttr()) + if (getStaticHighPadAttr().size() != getCopy().getType().getRank()) + return emitOpError("expected padding number for every dimension"); + + unsigned numDynamicPads = llvm::count( + getStaticHighPad().value_or(std::nullopt), ShapedType::kDynamic); + if (numDynamicPads != getHighPad().size()) + return emitOpError("expected ") + << numDynamicPads << " dynamic padding values"; + + return success(); +} + +LogicalResult StartTensorCopyOp::fold(FoldAdaptor adaptor, + SmallVectorImpl &results) { + if (hasPadding()) { + // Remove noop padding. + if (llvm::all_of(getStaticHighPadAttr().asArrayRef(), + [](int64_t value) { return value == 0; })) { + removeStaticHighPadAttr(); + return success(); + } + + // Fold dynamic indices with constant values into the static list. + { + bool changed = false; + SmallVector padding = + llvm::to_vector(getStaticHighPadAttr().asArrayRef()); + unsigned dynamicIndex = 0; + for (int64_t &value : padding) { + if (!ShapedType::isDynamic(value)) + continue; + + if (auto integer = dyn_cast_or_null( + adaptor.getHighPad()[dynamicIndex])) { + value = integer.getValue().getZExtValue(); + getHighPadMutable().erase(dynamicIndex); + changed = true; + } else { + dynamicIndex++; + } + } + if (changed) { + setStaticHighPad(padding); + return success(); + } + } + } + + auto waitOp = getCopy().getDefiningOp(); + if (!waitOp) + return failure(); + auto copyOp = waitOp.getTransferTensor().getDefiningOp(); + if (!copyOp) + return failure(); + + if (hasPadding() && + (copyOp.getStaticHighPadAttr() != getStaticHighPadAttr() || + copyOp.getHighPad() != getHighPad())) + return failure(); + + results.emplace_back(waitOp); + results.emplace_back(CompletedTokenAttr::get(getContext())); + return success(); +} + +SmallVector StartTensorCopyOp::getMixedHighPad() { + Builder builder(getContext()); + if (!hasPadding()) + return SmallVector(getResult().getType().getRank(), + builder.getIndexAttr(0)); + + return getMixedValues(getStaticHighPadAttr().asArrayRef(), getHighPad(), + builder); +} + +//===----------------------------------------------------------------------===// +// StartTensorCopyOp::BufferizableOpInterface +//===----------------------------------------------------------------------===// + +/// Returns whether the allocation can be elided entirely. +/// Returns an empty optional if it was not possible to determine. 
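+/// The allocation (and the copy) can only be elided when no padding is
+/// requested and 'copy' already bufferizes to a buffer in the requested
+/// memory space.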
+std::optional StartTensorCopyOp::elidesAllocation( + const bufferization::BufferizationOptions &options, + SmallVector *invocationStack) { + // Padding cannot be elided in general, even if the copied buffer is in L1. + if (hasPadding()) + return false; + + FailureOr copyType = + invocationStack + ? bufferization::getBufferType(getCopy(), options, *invocationStack) + : bufferization::getBufferType(getCopy(), options); + if (failed(copyType)) + return std::nullopt; + + return copyType->getMemorySpace() == getMemorySpaceAttr(); +} + +bool StartTensorCopyOp::resultBufferizesToMemoryWrite( + OpResult opResult, const bufferization::AnalysisState &state) { + assert(opResult == getResult() && "no other result"); + + std::optional matches = elidesAllocation(state.getOptions()); + // Conservative answer. + if (!matches) + return true; + + // No copy is performed unless the address space does not match. + // Copy in this context implies that we are writing to the result. + return !*matches; +} + +bool StartTensorCopyOp::bufferizesToMemoryRead( + OpOperand &opOperand, const bufferization::AnalysisState &state) { + assert(opOperand == getCopyMutable() && "have only one operand"); + + std::optional result = elidesAllocation(state.getOptions()); + // Conservative answer. + if (!result) + return true; + + // We only read from the buffer if we are copying. + return !*result; +} + +bool StartTensorCopyOp::bufferizesToMemoryWrite( + OpOperand &opOperand, const bufferization::AnalysisState &) { + assert(opOperand == getCopyMutable() && "have only one operand"); + + // We do not write into the buffer we are copying ever. + return false; +} + +AliasingValueList StartTensorCopyOp::getAliasingValues( + OpOperand &opOperand, const bufferization::AnalysisState &state) { + assert(opOperand == getCopyMutable() && "have only one operand"); + + std::optional result = elidesAllocation(state.getOptions()); + if (!result) + // Assume the worst case. + return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/false}}; + + // Always a brand-new allocation unless the input buffer is already in L1 and + // we elide the copy, in which case operand and result alias. + if (*result) + return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/true}}; + + return {}; +} + +bool StartTensorCopyOp::bufferizesToAllocation(Value value) { + assert(value == getResult() && "have only one result"); + + if (elidesAllocation() == true) + return false; + + // True is the conservative reply, according to the docs. + return true; +} + +FailureOr +StartTensorCopyOp::getBufferType(Value value, + const BufferizationOptions &options, + SmallVector &invocationStack) { + assert(value == getResult() && "have only one result"); + + bool contained = llvm::is_contained(invocationStack, value); + if (!contained) + if (elidesAllocation(options, &invocationStack) == true) + return bufferization::getBufferType(getCopy(), options, invocationStack); + + // Unless contained in the invocation stack (where we are free to impose the + // most optimal layout), we do not really impose a specific layout on the + // result. Contiguous is a good bet for now. 
+ return getMemRefTypeWithStaticIdentityLayout(getResult().getType(), + getMemorySpaceAttr()); +} + +LogicalResult +StartTensorCopyOp::bufferize(RewriterBase &rewriter, + const BufferizationOptions &options) { + if (use_empty()) { + rewriter.eraseOp(*this); + return success(); + } + + FailureOr copyType = + bufferization::getBufferType(getCopy(), options); + if (failed(copyType)) + return failure(); + + FailureOr copyBuffer = getBuffer(rewriter, getCopy(), options); + if (failed(copyBuffer)) + return failure(); + + std::optional result = elidesAllocation(options); + if (!result) + return failure(); + + if (*result) { + Value token = rewriter.create(getLoc()); + replaceOpWithBufferizedValues(rewriter, getOperation(), + {*copyBuffer, token}); + return success(); + } + + FailureOr allocType = + bufferization::getBufferType(getResult(), options); + if (failed(allocType)) + return failure(); + + SmallVector copyBufferSizes = + memref::getMixedSizes(rewriter, getLoc(), *copyBuffer); + + // Compute the dynamic dimensions for the allocation. + SmallVector dynamicDims; + for (auto [index, shape, pad] : + llvm::enumerate(allocType->getShape(), getMixedHighPad())) { + if (!ShapedType::isDynamic(shape)) + continue; + + dynamicDims.push_back(affine::makeComposedAffineApply( + rewriter, getLoc(), + rewriter.getAffineDimExpr(0) + rewriter.getAffineDimExpr(1), + ArrayRef{copyBufferSizes[index], pad})); + } + + FailureOr alloc = options.createAlloc( + rewriter, getLoc(), llvm::cast(*allocType), + /*dynShape=*/dynamicDims); + if (failed(alloc)) + return failure(); + + // Zero out the entire buffer prior to overwriting it with the copied values. + // TODO: This could be optimized to only zero regions that won't be filled + // with the copied values at the cost of 2^rank transfers instead of two. + if (hasPadding() && !getUndefPadding()) + rewriter.create(getLoc(), *alloc); + + // Subview into the original memory without any padding. + // As we only add padding at the end of the dimensions, the offsets are always + // zero. + Value destination = rewriter.create( + getLoc(), *alloc, + /*offsets=*/ + SmallVector(allocType->getRank(), rewriter.getIndexAttr(0)), + copyBufferSizes, + /*strides=*/ + SmallVector(allocType->getRank(), + rewriter.getIndexAttr(1))); + Value token = + rewriter.create(getLoc(), *copyBuffer, destination); + + // Replace op. 
+ replaceOpWithBufferizedValues(rewriter, getOperation(), {*alloc, token}); + return success(); +} + +//===----------------------------------------------------------------------===// +// WaitForTensorCopyOp +//===----------------------------------------------------------------------===// + +OpFoldResult WaitForTensorCopyOp::fold(FoldAdaptor adaptor) { + if (adaptor.getToken()) + return getTransferTensor(); + + return nullptr; +} + +//===----------------------------------------------------------------------===// +// WaitForTensorCopyOp::BufferizableOpInterface +//===----------------------------------------------------------------------===// + +bool WaitForTensorCopyOp::mustBufferizeInPlace( + OpOperand &opOperand, const bufferization::AnalysisState &state) { + return true; +} + +bool WaitForTensorCopyOp::bufferizesToMemoryRead( + OpOperand &opOperand, const bufferization::AnalysisState &state) { + if (opOperand == getTransferTensorMutable()) + return false; + + if (opOperand == getCopyMutable()) + return true; + + llvm_unreachable("unknown operand"); +} + +bool WaitForTensorCopyOp::bufferizesToMemoryWrite( + OpOperand &opOperand, const bufferization::AnalysisState &) { + if (opOperand == getTransferTensorMutable()) + return true; + + if (opOperand == getCopyMutable()) + return false; + + llvm_unreachable("unknown operand"); +} + +AliasingValueList WaitForTensorCopyOp::getAliasingValues( + OpOperand &opOperand, const bufferization::AnalysisState &state) { + if (opOperand == getCopyMutable()) + return {}; + + if (opOperand == getTransferTensorMutable()) + return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/true}}; + + llvm_unreachable("unknown operand"); +} + +LogicalResult +WaitForTensorCopyOp::bufferize(RewriterBase &rewriter, + const BufferizationOptions &options) { + FailureOr transferTensorBuffer = + getBuffer(rewriter, getTransferTensor(), options); + if (failed(transferTensorBuffer)) + return failure(); + + rewriter.create(getLoc(), getToken()); + replaceOpWithBufferizedValues(rewriter, getOperation(), + *transferTensorBuffer); + return success(); +} + +bool WaitForTensorCopyOp::isNotConflicting( + OpOperand *uRead, OpOperand *uWrite, + const bufferization::AnalysisState &state) { + if (*uRead == getCopyMutable() && *uWrite == getTransferTensorMutable()) + return true; + + return false; +} + +//===----------------------------------------------------------------------===// +// CompletedTokenOp +//===----------------------------------------------------------------------===// + +OpFoldResult CompletedTokenOp::fold(FoldAdaptor adaptor) { + return CompletedTokenAttr::get(getContext()); +} + +//===----------------------------------------------------------------------===// +// StartTransferOp +//===----------------------------------------------------------------------===// + +OpFoldResult StartTransferOp::fold(FoldAdaptor adaptor) { + if (getSource() != getDest()) + return nullptr; + + return CompletedTokenAttr::get(getContext()); +} + +//===----------------------------------------------------------------------===// +// WaitForTransfersOp +//===----------------------------------------------------------------------===// + +LogicalResult WaitForTransfersOp::fold(FoldAdaptor adaptor, + SmallVectorImpl &results) { + bool changed = false; + MutableOperandRange tokens = getTokensMutable(); + for (int i = tokens.size() - 1; i >= 0; i--) { + if (adaptor.getTokens()[i]) { + changed = true; + tokens.erase(i); + } + } + return success(changed); +} + +LogicalResult 
+WaitForTransfersOp::canonicalize(WaitForTransfersOp op,
+                                 PatternRewriter &rewriter) {
+  if (!op.getTokens().empty())
+    return failure();
+
+  rewriter.eraseOp(op);
+  return success();
+}
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.h b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.h
new file mode 100644
index 0000000..a9f1f6f
--- /dev/null
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.h
@@ -0,0 +1,14 @@
+
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "DMATypes.h"
+
+#define GET_OP_CLASSES
+#include "Quidditch/Dialect/DMA/IR/DMAOps.h.inc"
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td
new file mode 100644
index 0000000..b5f23fd
--- /dev/null
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td
@@ -0,0 +1,206 @@
+#ifndef QUIDDITCH_DIALECT_SNITCH_DMAOPS
+#define QUIDDITCH_DIALECT_SNITCH_DMAOPS
+
+include "Quidditch/Dialect/DMA/IR/DMADialect.td"
+include "Quidditch/Dialect/DMA/IR/DMATypes.td"
+include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
+include "mlir/IR/CommonTypeConstraints.td"
+include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+class DMA_Op<string mnemonic, list<Trait> traits = []> :
+  Op<DMA_Dialect, mnemonic, traits>;
+
+def DMA_StartTensorCopyOp : DMA_Op<"start_tensor_copy",
+  [Pure, AllRanksMatch<["copy", "result"]>,
+   DeclareOpInterfaceMethods<BufferizableOpInterface>]> {
+
+  let description = [{
+    Operation starting a copy of a tensor to another memory space, optionally
+    adding padding and returning it as a new tensor.
+    The contained values of the resulting tensor are in an unspecified state.
+    See `wait_for_tensor_copy` to transform the tensor value into a state
+    equal to `$copy`.
+
+    The operation may optionally add padding at the end of each dimension of
+    the tensor. Zero is used as the padding value.
+    The dimensions of the result tensor are computed using
+    `dims(copy)[i] + high_pad[i]`.
+
+    This operation is a no-op if `$copy` is already in the given memory space,
+    no padding is added, and bufferization can elide the copy.
+  }];
+
+  let arguments = (ins AnyRankedTensor:$copy,
+                       AnyAttr:$memory_space,
+                       Variadic<Index>:$high_pad,
+                       OptionalAttr<DenseI64ArrayAttr>:$static_high_pad,
+                       UnitAttr:$undef_padding
+  );
+
+  let results = (outs
+    AnyRankedTensor:$result,
+    DMA_TokenType:$token
+  );
+
+  let assemblyFormat = [{
+    `of` $copy `to` $memory_space
+    ( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `by`
+    custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
+    custom(ref($static_high_pad), type($copy), type($result))
+    attr-dict
+  }];
+
+  let builders = [
+    OpBuilder<(ins "mlir::Value":$copy, "mlir::Attribute":$memorySpace), [{
+      build($_builder, $_state, copy.getType(),
+            $_builder.getType<TokenType>(), copy, memorySpace,
+            /*high_pad=*/mlir::ValueRange(), /*static_high_pad=*/nullptr);
+    }]>
+  ];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+  private:
+    std::optional<bool>
+    elidesAllocation(const mlir::bufferization::BufferizationOptions &options = {},
+                     llvm::SmallVector<mlir::Value> *invocationStack = nullptr);
+  public:
+
+    bool hasPadding() {
+      return static_cast<bool>(getStaticHighPadAttr());
+    }
+
+    llvm::SmallVector<mlir::OpFoldResult> getMixedHighPad();
+  }];
+
+  let hasFolder = 1;
+}
+
+def DMA_WaitForTensorCopyOp : DMA_Op<"wait_for_tensor_copy",
+  [AllTypesMatch<["transfer_tensor", "result"]>, Pure,
+   DeclareOpInterfaceMethods<BufferizableOpInterface>]> {
+
+  let description = [{
+    Operation asserting that a previous `start_tensor_copy` operation has finished.
+    Unless `token` is the result of a `completed_token` operation,
+    `transfer_tensor` and `token` must at runtime be the tensor and token yielded
+    by a `start_tensor_copy` operation and `copy` the original tensor used in
+    that `start_tensor_copy`.
+
+    Once this operation returns, the returned tensor's values are guaranteed
+    to be equal to the `copy` operand and to reside in the memory space
+    specified in `start_tensor_copy`.
+
+    Note: The additional `copy` operand is given as it is effectively read by
+    this operation.
+    This additionally guarantees that the bufferization framework does not
+    perform a write to the underlying buffer of `copy` while the transfer is
+    in progress.
+  }];
+
+  let arguments = (ins
+    AnyRankedTensor:$transfer_tensor,
+    DMA_TokenType:$token,
+    AnyRankedTensor:$copy
+  );
+
+  let results = (outs
+    AnyRankedTensor:$result
+  );
+
+  let assemblyFormat = [{
+    `of` $copy `:` type($copy) `to` $transfer_tensor `using` $token `->` type($transfer_tensor) attr-dict
+  }];
+
+  let hasFolder = 1;
+}
+
+def DMA_StartTransferOp : DMA_Op<"start_transfer",
+  [MemoryEffects<[MemWrite]>, SameOperandsElementType, SameOperandsShape]> {
+
+  let description = [{
+    Operation performing a DMA transfer from one MemRef to another.
+    The shapes (including dynamic ones at runtime) of both MemRefs must be
+    identical, while differing strides and offsets are allowed.
+
+    The DMA operation is likely (but not guaranteed) to run asynchronously and
+    its completion is only guaranteed by executing the `wait_for_transfers`
+    operation with the token returned by this operation or a later one.
+  }];
+
+  let arguments = (ins
+    Arg<AnyMemRef, "source", [MemRead]>:$source,
+    Arg<AnyMemRef, "destination", [MemWrite]>:$dest
+  );
+
+  let results = (outs DMA_TokenType:$token);
+
+  let assemblyFormat = [{
+    `from` $source `:` type($source) `to` $dest `:` type($dest) attr-dict
+  }];
+
+  let hasFolder = 1;
+}
+
+def DMA_StartZeroMemTransferOp : DMA_Op<"start_zero_mem_transfer",
+  [MemoryEffects<[MemWrite]>]> {
+
+  let description = [{
+    Operation starting a DMA transfer that fills the given MemRef with zeros.
+
+    As with `start_transfer`, completion is only guaranteed once the returned
+    token has been awaited using `wait_for_transfers`.
+  }];
+
+  let arguments = (ins
+    Arg<AnyMemRef, "zeroed buffer", [MemWrite]>:$filled
+  );
+
+  let results = (outs DMA_TokenType:$token);
+
+  let assemblyFormat = [{
+    $filled `:` type($filled) attr-dict
+  }];
+}
+
+def DMA_WaitForTransfersOp : DMA_Op<"wait_for_transfers"> {
+
+  let description = [{
+    Operation waiting for the DMA transfers denoted by its tokens to finish.
+  }];
+
+  let arguments = (ins
+    Variadic<DMA_TokenType>:$tokens
+  );
+
+  let assemblyFormat = [{
+    ($tokens^ `:` type($tokens))? attr-dict
+  }];
+
+  let hasFolder = 1;
+  let hasCanonicalizeMethod = 1;
+}
+
+def DMA_CompletedTokenOp
+  : DMA_Op<"completed_token", [Pure, ConstantLike]> {
+
+  let description = [{
+    Op returning a special value representing a completed DMA transfer.
+    Passing this token to `wait_for_transfers` will always return immediately.
+  }];
+
+  let results = (outs DMA_TokenType:$token);
+
+  let assemblyFormat = [{
+    attr-dict
+  }];
+
+  let hasFolder = 1;
+}
+
+#endif
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.cpp
new file mode 100644
index 0000000..7d8c729
--- /dev/null
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.cpp
@@ -0,0 +1,11 @@
+#include "DMATypes.h"
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/OpImplementation.h"
+
+#include "DMADialect.h"
+
+#define GET_TYPEDEF_CLASSES
+#include "Quidditch/Dialect/DMA/IR/DMATypes.cpp.inc"
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.h b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.h
new file mode 100644
index 0000000..fde865f
--- /dev/null
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.h
@@ -0,0 +1,7 @@
+
+#pragma once
+
+#include "mlir/IR/Types.h"
+
+#define GET_TYPEDEF_CLASSES
+#include "Quidditch/Dialect/DMA/IR/DMATypes.h.inc"
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.td
new file mode 100644
index 0000000..0288166
--- /dev/null
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMATypes.td
@@ -0,0 +1,18 @@
+#ifndef QUIDDITCH_DIALECT_SNITCH_DMATYPES
+#define QUIDDITCH_DIALECT_SNITCH_DMATYPES
+
+include "Quidditch/Dialect/DMA/IR/DMADialect.td"
+include "mlir/IR/AttrTypeBase.td"
+
+class DMA_Type<string name, list<Trait> traits = []> :
+  TypeDef<DMA_Dialect, name, traits>;
+
+def DMA_TokenType : DMA_Type<"Token"> {
+  let mnemonic = "token";
+
+  let description = [{
+    Type representing a potentially active DMA transfer.
+  }];
+}
+
+#endif
diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.td
index 634cb21..35f27f5 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.td
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.td
@@ -8,16 +8,6 @@ include "mlir/IR/AttrTypeBase.td"
 class QuidditchSnitch_Attr<string name, list<Trait> traits = []> :
     AttrDef<QuidditchSnitch_Dialect, name, traits>;
 
-def QuidditchSnitch_CompletedTokenAttr : QuidditchSnitch_Attr<"CompletedToken"> {
-
-  let mnemonic = "completed_token";
-
-  let description = [{
-    Attribute representing an instance of a `!quidditch_snitch.dma_token`
-    signaling a complete transfer.
- }]; -} - def QuidditchSnitch_L1EncodingAttr : QuidditchSnitch_Attr<"L1Encoding"> { let mnemonic = "l1_encoding"; diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.cpp index 2196493..bfe363f 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.cpp +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.cpp @@ -60,13 +60,3 @@ void QuidditchSnitchDialect::initialize() { #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchTypes.cpp.inc" >(); } - -Operation *QuidditchSnitchDialect::materializeConstant(OpBuilder &builder, - Attribute value, - Type type, - Location loc) { - if (isa(value)) - return builder.create(loc); - - return nullptr; -} diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.td index 4292a6c..eadd62e 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.td +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.td @@ -14,8 +14,8 @@ def QuidditchSnitch_Dialect : Dialect { ); let useDefaultAttributePrinterParser = 1; - let useDefaultTypePrinterParser = 1; - let hasConstantMaterializer = 1; + let useDefaultTypePrinterParser = 0; + let hasConstantMaterializer = 0; } #endif diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp index aca847c..143cabf 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp @@ -396,416 +396,6 @@ void MicrokernelFenceOp::replaceWithNoop(RewriterBase &rewriter) { rewriter.eraseOp(*this); } -//===----------------------------------------------------------------------===// -// StartTensorCopyOp -//===----------------------------------------------------------------------===// - -LogicalResult StartTensorCopyOp::verify() { - if (getStaticHighPadAttr()) - if (getStaticHighPadAttr().size() != getCopy().getType().getRank()) - return emitOpError("expected padding number for every dimension"); - - unsigned numDynamicPads = llvm::count( - getStaticHighPad().value_or(std::nullopt), ShapedType::kDynamic); - if (numDynamicPads != getHighPad().size()) - return emitOpError("expected ") - << numDynamicPads << " dynamic padding values"; - - return success(); -} - -LogicalResult StartTensorCopyOp::fold(FoldAdaptor adaptor, - SmallVectorImpl &results) { - if (hasPadding()) { - // Remove noop padding. - if (llvm::all_of(getStaticHighPadAttr().asArrayRef(), - [](int64_t value) { return value == 0; })) { - removeStaticHighPadAttr(); - return success(); - } - - // Fold dynamic indices with constant values into the static list. 
- { - bool changed = false; - SmallVector padding = - llvm::to_vector(getStaticHighPadAttr().asArrayRef()); - unsigned dynamicIndex = 0; - for (int64_t &value : padding) { - if (!ShapedType::isDynamic(value)) - continue; - - if (auto integer = dyn_cast_or_null( - adaptor.getHighPad()[dynamicIndex])) { - value = integer.getValue().getZExtValue(); - getHighPadMutable().erase(dynamicIndex); - changed = true; - } else { - dynamicIndex++; - } - } - if (changed) { - setStaticHighPad(padding); - return success(); - } - } - } - - auto waitOp = getCopy().getDefiningOp(); - if (!waitOp) - return failure(); - auto copyOp = waitOp.getTransferTensor().getDefiningOp(); - if (!copyOp) - return failure(); - - if (hasPadding() && - (copyOp.getStaticHighPadAttr() != getStaticHighPadAttr() || - copyOp.getHighPad() != getHighPad())) - return failure(); - - results.emplace_back(waitOp); - results.emplace_back(CompletedTokenAttr::get(getContext())); - return success(); -} - -SmallVector StartTensorCopyOp::getMixedHighPad() { - Builder builder(getContext()); - if (!hasPadding()) - return SmallVector(getResult().getType().getRank(), - builder.getIndexAttr(0)); - - return getMixedValues(getStaticHighPadAttr().asArrayRef(), getHighPad(), - builder); -} - -//===----------------------------------------------------------------------===// -// StartTensorCopyOp::BufferizableOpInterface -//===----------------------------------------------------------------------===// - -/// Returns whether the allocation can be elided entirely. -/// Returns an empty optional if it was not possible to determine. -std::optional StartTensorCopyOp::elidesAllocation( - const bufferization::BufferizationOptions &options, - SmallVector *invocationStack) { - // Padding cannot be elided in general, even if the copied buffer is in L1. - if (hasPadding()) - return false; - - FailureOr copyType = - invocationStack - ? bufferization::getBufferType(getCopy(), options, *invocationStack) - : bufferization::getBufferType(getCopy(), options); - if (failed(copyType)) - return std::nullopt; - - return isa_and_nonnull(copyType->getMemorySpace()); -} - -bool StartTensorCopyOp::resultBufferizesToMemoryWrite( - OpResult opResult, const bufferization::AnalysisState &state) { - assert(opResult == getResult() && "no other result"); - - std::optional matches = elidesAllocation(state.getOptions()); - // Conservative answer. - if (!matches) - return true; - - // No copy is performed unless the address space does not match. - // Copy in this context implies that we are writing to the result. - return !*matches; -} - -bool StartTensorCopyOp::bufferizesToMemoryRead( - OpOperand &opOperand, const bufferization::AnalysisState &state) { - assert(opOperand == getCopyMutable() && "have only one operand"); - - std::optional result = elidesAllocation(state.getOptions()); - // Conservative answer. - if (!result) - return true; - - // We only read from the buffer if we are copying. - return !*result; -} - -bool StartTensorCopyOp::bufferizesToMemoryWrite( - OpOperand &opOperand, const bufferization::AnalysisState &) { - assert(opOperand == getCopyMutable() && "have only one operand"); - - // We do not write into the buffer we are copying ever. - return false; -} - -AliasingValueList StartTensorCopyOp::getAliasingValues( - OpOperand &opOperand, const bufferization::AnalysisState &state) { - assert(opOperand == getCopyMutable() && "have only one operand"); - - std::optional result = elidesAllocation(state.getOptions()); - if (!result) - // Assume the worst case. 
- return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/false}}; - - // Always a brand-new allocation unless the input buffer is already in L1 and - // we elide the copy, in which case operand and result alias. - if (*result) - return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/true}}; - - return {}; -} - -bool StartTensorCopyOp::bufferizesToAllocation(Value value) { - assert(value == getResult() && "have only one result"); - - if (elidesAllocation() == true) - return false; - - // True is the conservative reply, according to the docs. - return true; -} - -FailureOr -StartTensorCopyOp::getBufferType(Value value, - const BufferizationOptions &options, - SmallVector &invocationStack) { - assert(value == getResult() && "have only one result"); - - bool contained = llvm::is_contained(invocationStack, value); - if (!contained) - if (elidesAllocation(options, &invocationStack) == true) - return bufferization::getBufferType(getCopy(), options, invocationStack); - - // Unless contained in the invocation stack (where we are free to impose the - // most optimal layout), we do not really impose a specific layout on the - // result. Contiguous is a good bet for now. - return getMemRefTypeWithStaticIdentityLayout( - getResult().getType(), L1EncodingAttr::get(getContext())); -} - -LogicalResult -StartTensorCopyOp::bufferize(RewriterBase &rewriter, - const BufferizationOptions &options) { - if (use_empty()) { - rewriter.eraseOp(*this); - return success(); - } - - FailureOr copyType = - bufferization::getBufferType(getCopy(), options); - if (failed(copyType)) - return failure(); - - FailureOr copyBuffer = getBuffer(rewriter, getCopy(), options); - if (failed(copyBuffer)) - return failure(); - - std::optional result = elidesAllocation(options); - if (!result) - return failure(); - - if (*result) { - Value token = rewriter.create(getLoc()); - replaceOpWithBufferizedValues(rewriter, getOperation(), - {*copyBuffer, token}); - return success(); - } - - FailureOr allocType = - bufferization::getBufferType(getResult(), options); - if (failed(allocType)) - return failure(); - - SmallVector copyBufferSizes = - memref::getMixedSizes(rewriter, getLoc(), *copyBuffer); - - // Compute the dynamic dimensions for the allocation. - SmallVector dynamicDims; - for (auto [index, shape, pad] : - llvm::enumerate(allocType->getShape(), getMixedHighPad())) { - if (!ShapedType::isDynamic(shape)) - continue; - - dynamicDims.push_back(affine::makeComposedAffineApply( - rewriter, getLoc(), - rewriter.getAffineDimExpr(0) + rewriter.getAffineDimExpr(1), - ArrayRef{copyBufferSizes[index], pad})); - } - - FailureOr alloc = options.createAlloc( - rewriter, getLoc(), llvm::cast(*allocType), - /*dynShape=*/dynamicDims); - if (failed(alloc)) - return failure(); - - // Zero out the entire buffer prior to overwriting it with the copied values. - // TODO: This could be optimized to only zero regions that won't be filled - // with the copied values at the cost of 2^rank transfers instead of two. - if (hasPadding() && !getUndefPadding()) - rewriter.create(getLoc(), *alloc); - - // Subview into the original memory without any padding. - // As we only add padding at the end of the dimensions, the offsets are always - // zero. 
- Value destination = rewriter.create( - getLoc(), *alloc, - /*offsets=*/ - SmallVector(allocType->getRank(), rewriter.getIndexAttr(0)), - copyBufferSizes, - /*strides=*/ - SmallVector(allocType->getRank(), - rewriter.getIndexAttr(1))); - Value token = - rewriter.create(getLoc(), *copyBuffer, destination); - - // Replace op. - replaceOpWithBufferizedValues(rewriter, getOperation(), {*alloc, token}); - return success(); -} - -//===----------------------------------------------------------------------===// -// WaitForTensorCopyOp -//===----------------------------------------------------------------------===// - -OpFoldResult WaitForTensorCopyOp::fold(FoldAdaptor adaptor) { - if (adaptor.getToken()) - return getTransferTensor(); - - return nullptr; -} - -//===----------------------------------------------------------------------===// -// WaitForTensorCopyOp::BufferizableOpInterface -//===----------------------------------------------------------------------===// - -bool WaitForTensorCopyOp::mustBufferizeInPlace( - OpOperand &opOperand, const bufferization::AnalysisState &state) { - return true; -} - -bool WaitForTensorCopyOp::bufferizesToMemoryRead( - OpOperand &opOperand, const bufferization::AnalysisState &state) { - if (opOperand == getTransferTensorMutable()) - return false; - - if (opOperand == getCopyMutable()) - return true; - - llvm_unreachable("unknown operand"); -} - -bool WaitForTensorCopyOp::bufferizesToMemoryWrite( - OpOperand &opOperand, const bufferization::AnalysisState &) { - if (opOperand == getTransferTensorMutable()) - return true; - - if (opOperand == getCopyMutable()) - return false; - - llvm_unreachable("unknown operand"); -} - -AliasingValueList WaitForTensorCopyOp::getAliasingValues( - OpOperand &opOperand, const bufferization::AnalysisState &state) { - if (opOperand == getCopyMutable()) - return {}; - - if (opOperand == getTransferTensorMutable()) - return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/true}}; - - llvm_unreachable("unknown operand"); -} - -LogicalResult -WaitForTensorCopyOp::bufferize(RewriterBase &rewriter, - const BufferizationOptions &options) { - FailureOr transferTensorBuffer = - getBuffer(rewriter, getTransferTensor(), options); - if (failed(transferTensorBuffer)) - return failure(); - - rewriter.create(getLoc(), getToken()); - replaceOpWithBufferizedValues(rewriter, getOperation(), - *transferTensorBuffer); - return success(); -} - -bool WaitForTensorCopyOp::isNotConflicting( - OpOperand *uRead, OpOperand *uWrite, - const bufferization::AnalysisState &state) { - if (*uRead == getCopyMutable() && *uWrite == getTransferTensorMutable()) - return true; - - return false; -} - -//===----------------------------------------------------------------------===// -// CompletedTokenOp -//===----------------------------------------------------------------------===// - -OpFoldResult CompletedTokenOp::fold(FoldAdaptor adaptor) { - return CompletedTokenAttr::get(getContext()); -} - -//===----------------------------------------------------------------------===// -// StartDMATransferOp -//===----------------------------------------------------------------------===// - -OpFoldResult StartDMATransferOp::fold(FoldAdaptor adaptor) { - if (getSource() != getDest()) - return nullptr; - - return CompletedTokenAttr::get(getContext()); -} - -//===----------------------------------------------------------------------===// -// StartDMATransferOp::DMACoreSpecializationOpInterface -//===----------------------------------------------------------------------===// 
- -void StartDMATransferOp::replaceWithNoop(RewriterBase &rewriter) { - rewriter.replaceOpWithNewOp(*this); -} - -//===----------------------------------------------------------------------===// -// StartZeroMemTransferOp::DMACoreSpecializationOpInterface -//===----------------------------------------------------------------------===// - -void StartZeroMemTransferOp::replaceWithNoop(RewriterBase &rewriter) { - rewriter.replaceOpWithNewOp(*this); -} - -//===----------------------------------------------------------------------===// -// WaitForDMATransfersOp -//===----------------------------------------------------------------------===// - -LogicalResult -WaitForDMATransfersOp::fold(FoldAdaptor adaptor, - SmallVectorImpl &results) { - bool changed = false; - MutableOperandRange tokens = getTokensMutable(); - for (int i = tokens.size() - 1; i >= 0; i--) { - if (adaptor.getTokens()[i]) { - changed = true; - tokens.erase(i); - } - } - return success(changed); -} - -LogicalResult WaitForDMATransfersOp::canonicalize(WaitForDMATransfersOp op, - PatternRewriter &rewriter) { - if (!op.getTokens().empty()) - return failure(); - - rewriter.eraseOp(op); - return success(); -} - -//===----------------------------------------------------------------------===// -// WaitForDMATransfersOp::DMACoreSpecializationOpInterface -//===----------------------------------------------------------------------===// - -void WaitForDMATransfersOp::replaceWithNoop(RewriterBase &rewriter) { - rewriter.eraseOp(*this); -} - //===----------------------------------------------------------------------===// // ComputeCoreIndexOp::ComputeCoreSpecializationOpInterface //===----------------------------------------------------------------------===// diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td index 6c1b9a7..8612295 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td @@ -174,114 +174,6 @@ def QuidditchSnitch_MicrokernelFenceOp : QuidditchSnitch_Op<"microkernel_fence", }]; } -def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy", - [Pure, AllRanksMatch<["copy", "result"]>, - DeclareOpInterfaceMethods]> { - - let description = [{ - Operation starting a copy of a tensor to L1 memory space, optionally adding - padding and returning it as a new tensor. - The contained values of the resulting tensor is in an unspecified state. - See `wait_for_tensor_copy` to transform the tensor value into a state - equal to `$copy`. - - The operation may optionally add padding at the end of each dimension of - the tensor. Zero is used as the padding value. - The dimensions of the result tensor are computed using - `dims(copy)[i] + high_pad[i]`. - - This operation is a noop if `$copy` is already in L1, no padding is added, - and bufferization can elide the copy. - }]; - - let arguments = (ins AnyRankedTensor:$copy, - Variadic:$high_pad, - OptionalAttr:$static_high_pad, - UnitAttr:$undef_padding - ); - - let results = (outs - AnyRankedTensor:$result, - QuidditchSnitch_DMATokenType:$token - ); - - let assemblyFormat = [{ - $copy `to` `L1` - ( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `to` - custom($high_pad, $static_high_pad)^)? 
- `:` type($copy) `->` type($result) attr-dict - }]; - - let builders = [ - OpBuilder<(ins "mlir::Value":$copy), [{ - build($_builder, $_state, copy.getType(), - $_builder.getType(), copy, - /*high_pad=*/mlir::ValueRange(), /*static_high_pad=*/nullptr); - }]> - ]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - private: - std::optional - elidesAllocation(const mlir::bufferization::BufferizationOptions &options = {}, - llvm::SmallVector *invocationStack = nullptr); - public: - - bool hasPadding() { - return static_cast(getStaticHighPadAttr()); - } - - llvm::SmallVector getMixedHighPad(); - }]; - - let hasFolder = 1; -} - -def QuidditchSnitch_WaitForTensorCopyOp : QuidditchSnitch_Op<"wait_for_tensor_copy", - [AllTypesMatch<["transfer_tensor", "result"]>, Pure, - DeclareOpInterfaceMethods]> { - - let description = [{ - Operation asserting that a previous `start_tensor_copy` operation has finished. - Unless `token` is the result of an `completed_token` operation, - `transfer_tensor` and `token` must at runtime be a token and tensor yielded - by a `start_tensor_copy` operation and `copy` the original tensor used in - `start_tensor_copy`. - - Once this operation returns, the returned tensor's values are guaranteed - equal to the `copy` operand and in L1 memory. - - Note: The additional `copy` operand is given as it is effectively read by - this operation. - This additionally guarantees that the bufferization frame work does not - perform a write to the underlying buffer of `copy` while the transfer is - in progress. - }]; - - let arguments = (ins - AnyRankedTensor:$transfer_tensor, - QuidditchSnitch_DMATokenType:$token, - AnyRankedTensor:$copy - ); - - let results = (outs - AnyRankedTensor:$result - ); - - let assemblyFormat = [{ - `of` $copy `:` type($copy) `to` $transfer_tensor `using` $token `->` type($transfer_tensor) attr-dict - }]; - - let hasFolder = 1; -} - def FlatI8MemRef : ConfinedType, [HasStaticShapePred, HasAnyRankOfPred<[1]>], "one-dimensional i8 MemRef of a static size">; @@ -294,107 +186,6 @@ def QuidditchSnitch_L1MemoryViewOp : QuidditchSnitch_Op<"l1_memory_view", }]; } -def QuidditchSnitch_StartDMATransferOp : QuidditchSnitch_Op<"start_dma_transfer", - [MemoryEffects<[MemWrite]>, SameOperandsElementType, SameOperandsShape, - QuidditchSnitch_DMACoreSpecializationOpInterface]> { - - let description = [{ - Operation performing a DMA transfer from one MemRef to another. - The shapes (including dynamic ones at runtime) of both MemRefs must be - identical with different strides and offsets allowed. - - The DMA operation is likely (but not guaranteed) to run asynchronous and - its completion only guaranteed by executing the `wait_for_dma_transfers` - operation with the token returned by this operation or a later one. 
- }]; - - let arguments = (ins - Arg, "source", [MemRead]>:$source, - Arg, "destination", [MemWrite]>:$dest - ); - - let results = (outs QuidditchSnitch_DMATokenType:$token); - - let assemblyFormat = [{ - `from` $source `:` type($source) `to` $dest `:` type($dest) attr-dict - }]; - - let hasFolder = 1; - - let extraClassDeclaration = [{ - void replaceWithNoop(mlir::RewriterBase& rewriter); - }]; -} - -def QuidditchSnitch_StartZeroMemTransferOp : QuidditchSnitch_Op<"start_zero_mem_transfer", - [MemoryEffects<[MemWrite]>, - QuidditchSnitch_DMACoreSpecializationOpInterface]> { - - let description = [{ - - }]; - - let arguments = (ins - Arg, "zeroed buffer", [MemWrite]>:$filled - ); - - let results = (outs QuidditchSnitch_DMATokenType:$token); - - let assemblyFormat = [{ - $filled `:` type($filled) attr-dict - }]; - - let extraClassDeclaration = [{ - void replaceWithNoop(mlir::RewriterBase& rewriter); - }]; -} - -def QuidditchSnitch_WaitForDMATransfersOp - : QuidditchSnitch_Op<"wait_for_dma_transfers", [ - QuidditchSnitch_DMACoreSpecializationOpInterface - ]> { - - let description = [{ - Operation awaiting for DMA transfers denoted by its tokens to be finished. - }]; - - let arguments = (ins - Variadic:$tokens - ); - - let assemblyFormat = [{ - ($tokens^ `:` type($tokens))? attr-dict - }]; - - let hasFolder = 1; - let hasCanonicalizeMethod = 1; - - let extraClassDeclaration = [{ - bool needsSynchronization() { - return true; - } - - void replaceWithNoop(mlir::RewriterBase& rewriter); - }]; -} - -def QuidditchSnitch_CompletedTokenOp - : QuidditchSnitch_Op<"completed_token", [Pure, ConstantLike]> { - - let description = [{ - Op returning a special value representing a completed DMA transfer. - Passing this token to `wait_for_dma_transfers` will always return immediately. - }]; - - let results = (outs QuidditchSnitch_DMATokenType:$token); - - let assemblyFormat = [{ - attr-dict - }]; - - let hasFolder = 1; -} - def QuidditchSnitch_BarrierOp : QuidditchSnitch_Op<"barrier"> { let assemblyFormat = [{ attr-dict diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchTypes.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchTypes.td index 42a3bfc..05a006c 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchTypes.td +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchTypes.td @@ -7,12 +7,4 @@ include "mlir/IR/AttrTypeBase.td" class QuidditchSnitch_Type traits = []> : TypeDef; -def QuidditchSnitch_DMATokenType : QuidditchSnitch_Type<"DMAToken"> { - let mnemonic = "dma_token"; - - let description = [{ - Type representing a potentially active DMA transfer. 
- }]; -} - #endif diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/CMakeLists.txt b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/CMakeLists.txt index aca8165..7833461 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/CMakeLists.txt +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/CMakeLists.txt @@ -23,6 +23,7 @@ iree_cc_library( "SpecializeDMACode.cpp" DEPS ::PassesIncGen + Quidditch::Dialect::DMA::IR::DMADialect Quidditch::Dialect::Snitch::IR::QuidditchSnitchDialect MLIRIR MLIRAffineDialect diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/Passes.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/Passes.td index 3224d91..015ae86 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/Passes.td +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/Passes.td @@ -18,6 +18,7 @@ def PromotePadsToL1Pass : Pass<"quidditch-promote-pads-to-l1"> { let dependentDialects = [ "quidditch::Snitch::QuidditchSnitchDialect", + "quidditch::dma::DMADialect", ]; } @@ -28,6 +29,7 @@ def PromoteOperandsToL1Pass : Pass<"quidditch-promote-operands-to-l1"> { let dependentDialects = [ "quidditch::Snitch::QuidditchSnitchDialect", + "quidditch::dma::DMADialect", ]; } @@ -88,6 +90,7 @@ def LowerForallOpPass : Pass<"quidditch-lower-forall-op"> { def PipelineCopyComputePass : Pass<"quidditch-pipeline-copy-compute"> { let dependentDialects = [ "quidditch::Snitch::QuidditchSnitchDialect", + "quidditch::dma::DMADialect", ]; } diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PipelineCopyCompute.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PipelineCopyCompute.cpp index cb49f0d..68369f1 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PipelineCopyCompute.cpp +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PipelineCopyCompute.cpp @@ -1,5 +1,7 @@ #include "Passes.h" +#include "Quidditch/Dialect/DMA/IR/DMADialect.h" +#include "Quidditch/Dialect/DMA/IR/DMAOps.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.h" @@ -29,6 +31,7 @@ class PipelineCopyCompute using namespace mlir; using namespace mlir::iree_compiler; using namespace quidditch::Snitch; +using namespace quidditch::dma; /// Lifts an 'scf.for' op to a pipeline op with two stages. /// The body of the for loop gets placed in the second stage with all iter args diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp index a99a932..e440f6c 100644 --- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp +++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp @@ -1,5 +1,7 @@ #include "Passes.h" +#include "Quidditch/Dialect/DMA/IR/DMADialect.h" +#include "Quidditch/Dialect/DMA/IR/DMAOps.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchAttrs.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.h" @@ -50,6 +52,7 @@ class PromotePadsToL1 using namespace mlir; using namespace quidditch::Snitch; +using namespace quidditch::dma; void PromoteOperandsToL1::runOnOperation() { // Copy all tensors used as operands to compute ops into L1 memory. 
@@ -62,8 +65,9 @@ void PromoteOperandsToL1::runOnOperation() { auto builder = OpBuilder(computeOp); for (OpOperand *use : nonL1Uses) { - auto copyOp = builder.create(computeOp.getLoc(), - /*copy=*/use->get()); + auto copyOp = builder.create( + computeOp.getLoc(), + /*copy=*/use->get(), builder.getAttr()); auto waitOp = builder.create( computeOp.getLoc(), copyOp.getResult(), copyOp.getToken(), /*copy=*/use->get()); @@ -81,8 +85,9 @@ void PromoteAllocsToL1::runOnOperation() { } OpBuilder builder(tensorOp); - auto copyOp = builder.create(tensorOp.getLoc(), - tensorOp.getCopy()); + auto copyOp = + builder.create(tensorOp.getLoc(), tensorOp.getCopy(), + builder.getAttr()); auto waitOp = builder.create( tensorOp.getLoc(), copyOp.getResult(), copyOp.getToken(), /*copy=*/tensorOp.getCopy()); @@ -112,9 +117,9 @@ void PromotePadsToL1::runOnOperation() { OpBuilder builder(padOp); auto copyOp = builder.create( - padOp.getLoc(), padOp.getType(), builder.getType(), - padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr(), - undefPadding); + padOp.getLoc(), padOp.getType(), builder.getType(), + padOp.getSource(), builder.getAttr(), padOp.getHigh(), + padOp.getStaticHighAttr(), undefPadding); auto waitOp = builder.create( padOp.getLoc(), copyOp.getResult(), copyOp.getToken(), /*copy=*/padOp.getSource()); diff --git a/codegen/compiler/src/Quidditch/Target/CMakeLists.txt b/codegen/compiler/src/Quidditch/Target/CMakeLists.txt index 9f4cc7e..bdca133 100644 --- a/codegen/compiler/src/Quidditch/Target/CMakeLists.txt +++ b/codegen/compiler/src/Quidditch/Target/CMakeLists.txt @@ -32,6 +32,7 @@ iree_cc_library( DEPS ::PassesIncGen Quidditch::Conversion::ConvertSnitchToLLVM + Quidditch::Conversion::ConvertDMAToLLVM Quidditch::Dialect::Snitch::IR::QuidditchSnitchDialect MLIRFuncDialect MLIRIR @@ -48,6 +49,7 @@ iree_cc_library( ::Passes Quidditch::Conversion::ConvertToRISCV Quidditch::Dialect::Snitch::Transforms::Passes + Quidditch::Dialect::DMA::Extensions::DMACoreSpecializationOpInterfaceImpl IREELinalgTransformDialect LLVMAnalysis LLVMBitReader diff --git a/codegen/compiler/src/Quidditch/Target/ConvertToLLVM.cpp b/codegen/compiler/src/Quidditch/Target/ConvertToLLVM.cpp index bdeba3e..3a4138e 100644 --- a/codegen/compiler/src/Quidditch/Target/ConvertToLLVM.cpp +++ b/codegen/compiler/src/Quidditch/Target/ConvertToLLVM.cpp @@ -4,6 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "Quidditch/Conversion/ConvertDMAToLLVM.h" #include "Quidditch/Conversion/ConvertSnitchToLLVM.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h" #include "iree/compiler/Codegen/LLVMCPU/DispatchABI.h" @@ -1036,6 +1037,7 @@ void ConvertToLLVMPass::runOnOperation() { populateVectorToLLVMMatrixConversionPatterns(typeConverter, patterns); populateVectorToLLVMConversionPatterns(typeConverter, patterns, false); populateSnitchToLLVMConversionPatterns(module, typeConverter, patterns); + populateDMAToLLVMConversionPatterns(module, typeConverter, patterns); HALDispatchABI abi(&typeConverter); // clang-format off diff --git a/codegen/compiler/src/Quidditch/Target/QuidditchTarget.cpp b/codegen/compiler/src/Quidditch/Target/QuidditchTarget.cpp index 5e57741..dc51630 100644 --- a/codegen/compiler/src/Quidditch/Target/QuidditchTarget.cpp +++ b/codegen/compiler/src/Quidditch/Target/QuidditchTarget.cpp @@ -25,6 +25,9 @@ #include "mlir/Transforms/Passes.h" #include "Quidditch/Conversion/Passes.h" +#include "Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.h" +#include "Quidditch/Dialect/DMA/IR/DMADialect.h" +#include "Quidditch/Dialect/DMA/IR/DMAOps.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.h" #include "Quidditch/Dialect/Snitch/Transforms/Passes.h" @@ -129,9 +132,11 @@ class QuidditchTargetBackend final : public IREE::HAL::TargetBackend { void getDependentDialects(DialectRegistry ®istry) const override { mlir::registerBuiltinDialectTranslation(registry); mlir::registerLLVMDialectTranslation(registry); + quidditch::dma::registerDMACoreSpecializationOpInterface(registry); registry.insert(); + quidditch::Snitch::QuidditchSnitchDialect, + quidditch::dma::DMADialect>(); } void getDefaultExecutableTargets( @@ -208,14 +213,13 @@ class QuidditchTargetBackend final : public IREE::HAL::TargetBackend { return builder.create( loc, memRefType, dynamicSizes, builder.getI64IntegerAttr(alignment)); }; - BufferizationOptions::MemCpyFn memcpyFn = [](OpBuilder &builder, - Location loc, Value from, - Value to) { - Value token = - builder.create(loc, from, to); - builder.create(loc, token); - return success(); - }; + BufferizationOptions::MemCpyFn memcpyFn = + [](OpBuilder &builder, Location loc, Value from, Value to) { + Value token = + builder.create(loc, from, to); + builder.create(loc, token); + return success(); + }; FunctionLikeNest(modulePassManager) .addPass(createEliminateEmptyTensorsPass) diff --git a/codegen/tests/Conversion/ConvertSnitchToLLVM/dma_transfer.mlir b/codegen/tests/Conversion/ConvertDMAToLLVM/dma_transfer.mlir similarity index 79% rename from codegen/tests/Conversion/ConvertSnitchToLLVM/dma_transfer.mlir rename to codegen/tests/Conversion/ConvertDMAToLLVM/dma_transfer.mlir index dc09379..259049e 100644 --- a/codegen/tests/Conversion/ConvertSnitchToLLVM/dma_transfer.mlir +++ b/codegen/tests/Conversion/ConvertDMAToLLVM/dma_transfer.mlir @@ -8,14 +8,14 @@ // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_PTR:[[:alnum:]]+]] -func.func private @test(%arg0 : memref, %arg1 : memref) -> !quidditch_snitch.dma_token { +func.func private @test(%arg0 : memref, %arg1 : memref) -> !dma.token { // CHECK: %[[ZERO:.*]] = llvm.mlir.zero // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ZERO]][%[[ARG0_SIZE]]] // CHECK: %[[SIZE:.*]] = llvm.ptrtoint %[[GEP]] // CHECK: %[[R:.*]] = llvm.call 
@snrt_dma_start_1d(%[[ARG1_PTR]], %[[ARG0_PTR]], %[[SIZE]]) - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref to %arg1 : memref + %0 = dma.start_transfer from %arg0 : memref to %arg1 : memref // CHECK: return %[[R]] - return %0 : !quidditch_snitch.dma_token + return %0 : !dma.token } // CHECK-LABEL: @test2 @@ -27,22 +27,22 @@ func.func private @test(%arg0 : memref, %arg1 : memref) -> !quiddi // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_ALIGNED_PTR:[[:alnum:]]+]] // CHECK-SAME: %[[ARG1_OFFSET:[[:alnum:]]+]] -func.func private @test2(%arg0 : memref, %arg1 : memref>) -> !quidditch_snitch.dma_token { +func.func private @test2(%arg0 : memref, %arg1 : memref>) -> !dma.token { // CHECK: %[[ARG1_PTR:.*]] = llvm.getelementptr %[[ARG1_ALIGNED_PTR]][%[[ARG1_OFFSET]]] // CHECK: %[[GEP:.*]] = llvm.getelementptr %{{.*}}[%[[ARG0_SIZE]]] // CHECK: %[[SIZE:.*]] = llvm.ptrtoint %[[GEP]] // CHECK: %[[R:.*]] = llvm.call @snrt_dma_start_1d(%[[ARG1_PTR]], %[[ARG0_PTR]], %[[SIZE]]) - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref to %arg1 : memref> + %0 = dma.start_transfer from %arg0 : memref to %arg1 : memref> // CHECK: llvm.call @snrt_dma_start_1d( - %1 = quidditch_snitch.start_dma_transfer from %arg1 : memref> to %arg0 : memref - return %0 : !quidditch_snitch.dma_token + %1 = dma.start_transfer from %arg1 : memref> to %arg0 : memref + return %0 : !dma.token } // CHECK-LABEL: @test3 -func.func private @test3(%arg0 : memref, %arg1 : memref>) -> !quidditch_snitch.dma_token { +func.func private @test3(%arg0 : memref, %arg1 : memref>) -> !dma.token { // CHECK: llvm.call @snrt_dma_start_1d( - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref to %arg1 : memref> - return %0 : !quidditch_snitch.dma_token + %0 = dma.start_transfer from %arg0 : memref to %arg1 : memref> + return %0 : !dma.token } // CHECK-LABEL: @dynamic_inner( @@ -61,15 +61,15 @@ func.func private @dynamic_inner(%subview_3 : memref<1x?xf64, strided<[161, 1], // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[BYTES]] - %12 = quidditch_snitch.start_dma_transfer from %subview_3 : memref<1x?xf64, strided<[161, 1], offset: ?>> to %subview_5 : memref<1x?xf64, strided<[81, 1]>> + %12 = dma.start_transfer from %subview_3 : memref<1x?xf64, strided<[161, 1], offset: ?>> to %subview_5 : memref<1x?xf64, strided<[81, 1]>> return } // CHECK-LABEL: @test4 -func.func private @test4(%arg0 : memref<1x4xf32>, %arg1 : memref<1x4xf32, strided<[40, 1], offset: ?>>) -> !quidditch_snitch.dma_token { +func.func private @test4(%arg0 : memref<1x4xf32>, %arg1 : memref<1x4xf32, strided<[40, 1], offset: ?>>) -> !dma.token { // CHECK: llvm.call @snrt_dma_start_1d( - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref<1x4xf32> to %arg1 : memref<1x4xf32, strided<[40, 1], offset: ?>> - return %0 : !quidditch_snitch.dma_token + %0 = dma.start_transfer from %arg0 : memref<1x4xf32> to %arg1 : memref<1x4xf32, strided<[40, 1], offset: ?>> + return %0 : !dma.token } // CHECK-LABEL: @test5 @@ -86,7 +86,7 @@ func.func private @test4(%arg0 : memref<1x4xf32>, %arg1 : memref<1x4xf32, stride // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_STRIDE_N:[[:alnum:]]+]] -func.func private @test5(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, strided<[8, 1], offset: 0>>) -> !quidditch_snitch.dma_token { +func.func private @test5(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, strided<[8, 1], offset: 0>>) -> !dma.token { // CHECK-DAG: %[[ELEMENT_WIDTH:.*]] = 
llvm.mlir.constant(4 : i32) // CHECK-DAG: %[[FOUR_INDEX:.*]] = llvm.mlir.constant(4 : index) // CHECK-DAG: %[[TWO:.*]] = llvm.mlir.constant(2 : index) @@ -94,8 +94,8 @@ func.func private @test5(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, stride // CHECK: %[[ARG0_STRIDE:.*]] = llvm.mul %[[FOUR_INDEX]], %[[ELEMENT_WIDTH]] // CHECK: %[[ARG1_STRIDE:.*]] = llvm.mul %[[ARG1_STRIDE_N]], %[[ELEMENT_WIDTH]] // CHECK: llvm.call @snrt_dma_start_2d(%[[ARG1_PTR]], %[[ARG0_PTR]], %[[INNER_SIZE]], %[[ARG1_STRIDE]], %[[ARG0_STRIDE]], %[[TWO]]) - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref<2x4xf32> to %arg1 : memref<2x4xf32, strided<[8, 1], offset: 0>> - return %0 : !quidditch_snitch.dma_token + %0 = dma.start_transfer from %arg0 : memref<2x4xf32> to %arg1 : memref<2x4xf32, strided<[8, 1], offset: 0>> + return %0 : !dma.token } // CHECK-LABEL: @test6 @@ -116,7 +116,7 @@ func.func private @test5(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, stride // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_STRIDE0:[[:alnum:]]+]] // CHECK-SAME: %[[ARG1_STRIDE_N:[[:alnum:]]+]] -func.func private @test6(%arg0 : memref<3x2x4xf32>, %arg1 : memref<3x2x4xf32, strided<[16, 8, 1], offset: 2>>) -> !quidditch_snitch.dma_token { +func.func private @test6(%arg0 : memref<3x2x4xf32>, %arg1 : memref<3x2x4xf32, strided<[16, 8, 1], offset: 2>>) -> !dma.token { // CHECK-DAG: %[[ELEMENT_WIDTH:.*]] = llvm.mlir.constant(4 : i32) // CHECK-DAG: %[[ZERO32:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : index) : i32 @@ -147,9 +147,9 @@ func.func private @test6(%arg0 : memref<3x2x4xf32>, %arg1 : memref<3x2x4xf32, st // CHECK: %[[INV:.*]] = llvm.add %[[IV1]], %[[ONE]] // CHECK: llvm.br ^[[BB1]](%[[INV]], %[[RES]] - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref<3x2x4xf32> to %arg1 : memref<3x2x4xf32, strided<[16, 8, 1], offset: 2>> + %0 = dma.start_transfer from %arg0 : memref<3x2x4xf32> to %arg1 : memref<3x2x4xf32, strided<[16, 8, 1], offset: 2>> // CHECK: return %[[IV2]] - return %0 : !quidditch_snitch.dma_token + return %0 : !dma.token } @@ -167,7 +167,7 @@ func.func private @test6(%arg0 : memref<3x2x4xf32>, %arg1 : memref<3x2x4xf32, st // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_STRIDE_N:[[:alnum:]]+]] -func.func private @dynamic_strides(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, strided<[?, 1]>>) -> !quidditch_snitch.dma_token { +func.func private @dynamic_strides(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32, strided<[?, 1]>>) -> !dma.token { // CHECK-DAG: %[[ELEMENT_WIDTH:.*]] = llvm.mlir.constant(4 : i32) // CHECK-DAG: %[[FOUR:.*]] = llvm.mlir.constant(4 : index) // CHECK-DAG: %[[TWO:.*]] = llvm.mlir.constant(2 : index) @@ -175,8 +175,8 @@ func.func private @dynamic_strides(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf // CHECK: %[[ARG0_STRIDE:.*]] = llvm.mul %[[FOUR]], %[[ELEMENT_WIDTH]] // CHECK: %[[ARG1_STRIDE:.*]] = llvm.mul %[[ARG1_STRIDE_N]], %[[ELEMENT_WIDTH]] // CHECK: llvm.call @snrt_dma_start_2d(%[[ARG1_PTR]], %[[ARG0_PTR]], %[[INNER_SIZE]], %[[ARG1_STRIDE]], %[[ARG0_STRIDE]], %[[TWO]]) - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref<2x4xf32> to %arg1 : memref<2x4xf32, strided<[?, 1]>> - return %0 : !quidditch_snitch.dma_token + %0 = dma.start_transfer from %arg0 : memref<2x4xf32> to %arg1 : memref<2x4xf32, strided<[?, 1]>> + return %0 : !dma.token } // CHECK-LABEL: @contigious_dynamic_inner @@ -193,12 +193,12 @@ func.func private @dynamic_strides(%arg0 : memref<2x4xf32>, 
%arg1 : memref<2x4xf // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[ARG1_STRIDE_N:[[:alnum:]]+]] -func.func private @contigious_dynamic_inner(%arg0 : memref, %arg1 : memref>) -> !quidditch_snitch.dma_token { +func.func private @contigious_dynamic_inner(%arg0 : memref, %arg1 : memref>) -> !dma.token { // CHECK: %[[ELEMENT_WIDTH:.*]] = llvm.mlir.constant(4 : i32) // CHECK: %[[INNER_SIZE:.*]] = llvm.mul %[[ARG0_STRIDE_0]], %[[ELEMENT_WIDTH]] // CHECK: %[[ARG0_STRIDE:.*]] = llvm.mul %[[ARG0_STRIDE_N]], %[[ELEMENT_WIDTH]] // CHECK: %[[ARG1_STRIDE:.*]] = llvm.mul %[[ARG1_STRIDE_N]], %[[ELEMENT_WIDTH]] // CHECK: llvm.call @snrt_dma_start_2d(%[[ARG1_PTR]], %[[ARG0_PTR]], %[[INNER_SIZE]], %[[ARG1_STRIDE]], %[[ARG0_STRIDE]], %[[ARG0_SIZE]]) - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref to %arg1 : memref> - return %0 : !quidditch_snitch.dma_token + %0 = dma.start_transfer from %arg0 : memref to %arg1 : memref> + return %0 : !dma.token } diff --git a/codegen/tests/Conversion/ConvertSnitchToLLVM/dma_wait.mlir b/codegen/tests/Conversion/ConvertDMAToLLVM/dma_wait.mlir similarity index 79% rename from codegen/tests/Conversion/ConvertSnitchToLLVM/dma_wait.mlir rename to codegen/tests/Conversion/ConvertDMAToLLVM/dma_wait.mlir index 62e4128..47ad305 100644 --- a/codegen/tests/Conversion/ConvertSnitchToLLVM/dma_wait.mlir +++ b/codegen/tests/Conversion/ConvertDMAToLLVM/dma_wait.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: @test // CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -func.func private @test(%arg0 : !quidditch_snitch.dma_token) { +func.func private @test(%arg0 : !dma.token) { // CHECK: llvm.br ^[[BODY:[[:alnum:]]+]] // CHECK: ^[[BODY]]: // CHECK-NEXT: %[[ID:.*]] = llvm.inline_asm has_side_effects ".insn r 0x2b, 0, 0b100, $0, zero, zero @@ -11,7 +11,7 @@ func.func private @test(%arg0 : !quidditch_snitch.dma_token) { // CHECK: %[[COND:.*]] = llvm.icmp "ult" %[[ID]], %[[ARG0]] // CHECK: llvm.cond_br %[[COND]], ^[[BODY]], ^[[CONT:[[:alnum:]]+]] // CHECK: ^[[CONT]]: - quidditch_snitch.wait_for_dma_transfers %arg0 : !quidditch_snitch.dma_token + dma.wait_for_transfers %arg0 : !dma.token // CHECK-NEXT: llvm.return return } diff --git a/codegen/tests/Conversion/ConvertSnitchToLLVM/zero_mem_transfer.mlir b/codegen/tests/Conversion/ConvertDMAToLLVM/zero_mem_transfer.mlir similarity index 88% rename from codegen/tests/Conversion/ConvertSnitchToLLVM/zero_mem_transfer.mlir rename to codegen/tests/Conversion/ConvertDMAToLLVM/zero_mem_transfer.mlir index 5dcadb8..e0553a2 100644 --- a/codegen/tests/Conversion/ConvertSnitchToLLVM/zero_mem_transfer.mlir +++ b/codegen/tests/Conversion/ConvertDMAToLLVM/zero_mem_transfer.mlir @@ -5,7 +5,7 @@ // CHECK-SAME: %[[PTR:[[:alnum:]]+]] // CHECK-SAME: %{{[[:alnum:]]+}} // CHECK-SAME: %[[DIM0:[[:alnum:]]+]] -func.func private @test(%arg0 : memref) -> !quidditch_snitch.dma_token { +func.func private @test(%arg0 : memref) -> !dma.token { // CHECK-DAG: %[[NULL:.*]] = llvm.mlir.zero // CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[NULL]][%[[DIM0]]] @@ -16,9 +16,9 @@ func.func private @test(%arg0 : memref) -> !quidditch_snitch.dma_token { // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[PTR]][%[[OFFSET]]] // CHECK: %[[REM:.*]] = llvm.urem %[[SIZE]], %[[ZERO_MEM_SIZE]] // CHECK: %[[TOKEN:.*]] = llvm.call @snrt_dma_start_1d(%[[GEP]], %[[ZERO_MEM]], %[[REM]]) - %0 = quidditch_snitch.start_zero_mem_transfer %arg0 : memref + %0 = dma.start_zero_mem_transfer %arg0 : memref // CHECK: return %[[TOKEN]] - return %0 
: !quidditch_snitch.dma_token + return %0 : !dma.token } // CHECK-LABEL: @test1( @@ -30,7 +30,7 @@ func.func private @test(%arg0 : memref) -> !quidditch_snitch.dma_token { // CHECK-SAME: %[[DIM2:[[:alnum:]]+]] // CHECK-SAME: %[[STRIDE0:[[:alnum:]]+]] // CHECK-SAME: %[[STRIDE1:[[:alnum:]]+]] -func.func private @test1(%arg0 : memref>) -> !quidditch_snitch.dma_token { +func.func private @test1(%arg0 : memref>) -> !dma.token { // CHECK-DAG: %[[ZERO_INDEX:.*]] = llvm.mlir.constant(0 : index) // CHECK-DAG: %[[ZERO_I32:.*]] = llvm.mlir.constant(0 : i32) // CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : @@ -67,7 +67,7 @@ func.func private @test1(%arg0 : memref>) -> !quid // CHECK: llvm.br ^[[LOOP0]](%[[INC0]], %[[TOKEN1]] : // CHECK: ^[[EXIT0]]: - %0 = quidditch_snitch.start_zero_mem_transfer %arg0 : memref> + %0 = dma.start_zero_mem_transfer %arg0 : memref> // CHECK: return %[[TOKEN0]] - return %0 : !quidditch_snitch.dma_token + return %0 : !dma.token } diff --git a/codegen/tests/Conversion/ConvertSnitchToLLVM/completed_token.mlir b/codegen/tests/Conversion/ConvertSnitchToLLVM/completed_token.mlir index 5008ba2..45617b9 100644 --- a/codegen/tests/Conversion/ConvertSnitchToLLVM/completed_token.mlir +++ b/codegen/tests/Conversion/ConvertSnitchToLLVM/completed_token.mlir @@ -1,9 +1,9 @@ // RUN: quidditch-opt %s --quidditch-convert-to-llvm | FileCheck %s // CHECK-LABEL: @test -func.func private @test() -> !quidditch_snitch.dma_token { +func.func private @test() -> !dma.token { // CHECK: %[[T:.*]] = llvm.mlir.constant(0 : {{.*}}) // CHECK: return %[[T]] - %0 = quidditch_snitch.completed_token - return %0 : !quidditch_snitch.dma_token + %0 = dma.completed_token + return %0 : !dma.token } diff --git a/codegen/tests/Dialect/DMA/IR/bufferization.mlir b/codegen/tests/Dialect/DMA/IR/bufferization.mlir new file mode 100644 index 0000000..c6843fc --- /dev/null +++ b/codegen/tests/Dialect/DMA/IR/bufferization.mlir @@ -0,0 +1,125 @@ +// RUN: quidditch-opt %s --one-shot-bufferize | FileCheck %s + +// CHECK: #[[$MAP2:.*]] = affine_map<()[s0, s1] -> (s0 + s1)> + +// CHECK: func @copy_l1_buffer( +func.func @copy_l1_buffer(%arg0 : tensor<32xf32>) -> (tensor<32xf32>, !dma.token) { + // CHECK: %[[ARG0:.*]] = bufferization.to_memref + + // CHECK: %[[ALLOC:.*]] = memref.alloc() + // CHECK-SAME: : memref<32xf32, #quidditch_snitch.l1_encoding> + // CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC]] + // CHECK-SAME: to memref<32xf32, strided<[1]>, #quidditch_snitch.l1_encoding> + // CHECK: %[[TOKEN:.*]] = dma.start_transfer from %[[ARG0]] + // CHECK-SAME: to %[[SUBVIEW]] + // CHECK: %[[R:.*]] = bufferization.to_tensor %[[ALLOC]] + %r, %token = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor<32xf32> + // CHECK: return %[[R]], %[[TOKEN]] + return %r, %token : tensor<32xf32>, !dma.token +} + +// CHECK: func @copy_l1_buffer_elided( +func.func @copy_l1_buffer_elided(%arg0 : tensor<32xf32>) -> tensor<32xf32> { + // CHECK: memref.alloc() + // CHECK-NOT: memref.alloc() + %r:2 = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor<32xf32> + %r2 = dma.wait_for_tensor_copy of %arg0 : tensor<32xf32> to %r#0 using %r#1 -> tensor<32xf32> + %r3:2 = dma.start_tensor_copy of %r2 to #quidditch_snitch.l1_encoding -> tensor<32xf32> + %r4 = dma.wait_for_tensor_copy of %r2 : tensor<32xf32> to %r3#0 using %r3#1 -> tensor<32xf32> + // CHECK: return + return %r4 : tensor<32xf32> +} + +// CHECK: func @copy_l1_buffer_alloca_elided( +func.func @copy_l1_buffer_alloca_elided() -> tensor<32xf32> { + // 
CHECK: memref.alloc() + // CHECK-NOT: memref.alloc() + %r = bufferization.alloc_tensor() {memory_space = #quidditch_snitch.l1_encoding} : tensor<32xf32> + %r2:2 = dma.start_tensor_copy of %r to #quidditch_snitch.l1_encoding : tensor<32xf32> -> tensor<32xf32> + // CHECK: return + return %r2#0 : tensor<32xf32> +} + +// CHECK: func @scf_for_copy_l1_buffer( +func.func @scf_for_copy_l1_buffer() -> tensor<32xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + // CHECK: %[[MEMREF:.*]] = memref.alloc + %r = bufferization.alloc_tensor() {memory_space = #quidditch_snitch.l1_encoding} : tensor<32xf32> + %r2:2 = dma.start_tensor_copy of %r to #quidditch_snitch.l1_encoding : tensor<32xf32> -> tensor<32xf32> + // CHECK-NEXT: dma.completed_token + // CHECK-NEXT: %[[R:.*]] = scf.for + // CHECK-SAME: iter_args(%[[ITER:.*]] = %[[MEMREF]]) + // CHECK-NEXT: dma.completed_token + // CHECK-NEXT: scf.yield %[[ITER]] + // CHECK: bufferization.to_tensor %[[R]] + %r3 = scf.for %i = %c0 to %c1 step %c1 iter_args(%iter = %r2#0) -> (tensor<32xf32>) { + %r4:2 = dma.start_tensor_copy of %iter to #quidditch_snitch.l1_encoding -> tensor<32xf32> + scf.yield %r4#0 : tensor<32xf32> + } + return %r3 : tensor<32xf32> +} + +// CHECK: func @copy_l1_buffer_dynamic_dims( +func.func @copy_l1_buffer_dynamic_dims(%arg0 : tensor) -> tensor { + // CHECK: %[[ARG0:.*]] = bufferization.to_memref + // CHECK: %[[ZERO:.*]] = arith.constant 0 + // CHECK: %[[DIM_IN:.*]] = memref.dim %[[ARG0]], %[[ZERO]] + // CHECK: %[[DIM:.*]] = affine.apply #{{.*}}()[%[[DIM_IN]]] + // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]]) + // CHECK-SAME: : memref + // CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC]] + // CHECK-SAME: to memref, #quidditch_snitch.l1_encoding> + // CHECK: dma.start_transfer from %[[ARG0]] + // CHECK-SAME: to %[[SUBVIEW]] + // CHECK: %[[R:.*]] = bufferization.to_tensor %[[ALLOC]] + %r:2 = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor + // CHECK: return %[[R]] + return %r#0 : tensor +} + +// CHECK-LABEL: @tensor_copy_pad +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +// CHECK-SAME: %[[PAD0:[[:alnum:]]+]] +// CHECK-SAME: %[[PAD1:[[:alnum:]]+]] +func.func @tensor_copy_pad(%arg0 : tensor, %pad0 : index, %pad1 : index) -> (tensor, !dma.token) { + // CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]] + // CHECK: %[[ZERO:.*]] = arith.constant 0 + // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]] + // CHECK: %[[ONE:.*]] = arith.constant 1 + // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]] + // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]] + // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]] + // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]]) + // CHECK: start_zero_mem_transfer %[[ALLOC]] + // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1] + // CHECK: %[[TOKEN:.*]] = dma.start_transfer from %[[COPY]] + // CHECK-SAME: to %[[UNPADDED]] + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding pad with zero by [%pad0, %pad1] : tensor -> tensor + // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] + // CHECK: return %[[TENSOR]], %[[TOKEN]] + return %r, %t : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_copy_pad_undef +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +// CHECK-SAME: %[[PAD0:[[:alnum:]]+]] +// CHECK-SAME: %[[PAD1:[[:alnum:]]+]] +func.func @tensor_copy_pad_undef(%arg0 : tensor, %pad0 : index, %pad1 : index) -> (tensor, !dma.token) { + // 
CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]] + // CHECK: %[[ZERO:.*]] = arith.constant 0 + // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]] + // CHECK: %[[ONE:.*]] = arith.constant 1 + // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]] + // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]] + // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]] + // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]]) + // CHECK-NOT: start_zero_mem_transfer + // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1] + // CHECK-NEXT: %[[TOKEN:.*]] = dma.start_transfer from %[[COPY]] + // CHECK-SAME: to %[[UNPADDED]] + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding pad with undef by [%pad0, %pad1] : tensor -> tensor + // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] + // CHECK: return %[[TENSOR]], %[[TOKEN]] + return %r, %t : tensor, !dma.token +} diff --git a/codegen/tests/Dialect/DMA/IR/canonicalization.mlir b/codegen/tests/Dialect/DMA/IR/canonicalization.mlir new file mode 100644 index 0000000..60998f0 --- /dev/null +++ b/codegen/tests/Dialect/DMA/IR/canonicalization.mlir @@ -0,0 +1,111 @@ +// RUN: quidditch-opt %s --canonicalize --split-input-file --allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: @wait_gets_removed +func.func @wait_gets_removed() { + // CHECK-NEXT: return + %0 = dma.completed_token + dma.wait_for_transfers %0 : !dma.token + return +} + +// CHECK-LABEL: @noop_transfer +func.func @noop_transfer(%arg0 : memref) -> !dma.token { + // CHECK-NEXT: %[[R:.*]] = dma.completed_token + // CHECK-NEXT: return %[[R]] + %0 = dma.start_transfer from %arg0 : memref to %arg0 : memref + return %0 : !dma.token +} + +// CHECK-LABEL: @tensor_wait_gets_removed +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +// CHECK-SAME: %[[ARG1:[[:alnum:]]+]] +func.func @tensor_wait_gets_removed(%arg0 : tensor, %arg1 : tensor) -> tensor { + // CHECK-NEXT: return %[[ARG1]] + %t = dma.completed_token + %0 = dma.wait_for_tensor_copy of %arg0 : tensor to %arg1 using %t -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @tensor_noop_transfer +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +func.func @tensor_noop_transfer(%arg0 : tensor) -> (tensor, !dma.token) { + // CHECK: %[[T2:.*]] = dma.completed_token + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy of %[[ARG0]] + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor + // CHECK: %[[R2:.*]] = dma.wait_for_tensor_copy of %[[ARG0]] + // CHECK-SAME: to %[[R]] using %[[T]] + %0 = dma.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor + + // CHECK-NOT: wait_for_tensor_copy + %r2, %t2 = dma.start_tensor_copy of %0 to #quidditch_snitch.l1_encoding -> tensor + + // CHECK: return %[[R2]], %[[T2]] + return %r2, %t2 : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_noop_pad +func.func @tensor_noop_pad(%arg0 : tensor) -> (tensor, !dma.token) { + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy + // CHECK-NOT: pad with + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding pad with zero by [0] : tensor -> tensor + // CHECK-NEXT: return %[[R]], %[[T]] + return %r, %t : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_pad_constant +func.func @tensor_pad_constant(%arg0 : tensor) -> (tensor, !dma.token) { + %zero = arith.constant 0 : index + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy + // CHECK-NOT: pad with + %r, %t = dma.start_tensor_copy of %arg0 to 
#quidditch_snitch.l1_encoding pad with zero by [%zero] : tensor -> tensor + // CHECK-NEXT: return %[[R]], %[[T]] + return %r, %t : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_noop_transfer_pad +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +func.func @tensor_noop_transfer_pad(%arg0 : tensor) -> (tensor, !dma.token) { + // CHECK: %[[T2:.*]] = dma.completed_token + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy of %[[ARG0]] + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding pad with zero by [1] : tensor -> tensor + // CHECK: %[[R2:.*]] = dma.wait_for_tensor_copy of %[[ARG0]] + // CHECK-SAME: to %[[R]] using %[[T]] + %0 = dma.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor + + // CHECK-NOT: wait_for_tensor_copy + %r2, %t2 = dma.start_tensor_copy of %0 to #quidditch_snitch.l1_encoding -> tensor + + // CHECK: return %[[R2]], %[[T2]] + return %r2, %t2 : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_noop_transfer_pad_neg +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +func.func @tensor_noop_transfer_pad_neg(%arg0 : tensor) -> (tensor, !dma.token) { + // CHECK: start_tensor_copy + // CHECK: wait_for_tensor_copy + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy + // CHECK: return %[[R]], %[[T]] + + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor + %0 = dma.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor + %r2, %t2 = dma.start_tensor_copy of %0 to #quidditch_snitch.l1_encoding pad with zero by [1] : tensor -> tensor + return %r2, %t2 : tensor, !dma.token +} + +// CHECK-LABEL: @tensor_noop_transfer_same_padding +// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] +func.func @tensor_noop_transfer_same_padding(%arg0 : tensor) -> (tensor, !dma.token) { + // CHECK: %[[T2:.*]] = dma.completed_token + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy of %[[ARG0]] + %r, %t = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding pad with zero by [1] : tensor -> tensor + // CHECK: %[[R2:.*]] = dma.wait_for_tensor_copy of %[[ARG0]] + // CHECK-SAME: to %[[R]] using %[[T]] + %0 = dma.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor + + // CHECK-NOT: wait_for_tensor_copy + %r2, %t2 = dma.start_tensor_copy of %0 to #quidditch_snitch.l1_encoding pad with zero by [1] : tensor -> tensor + + // CHECK: return %[[R2]], %[[T2]] + return %r2, %t2 : tensor, !dma.token +} diff --git a/codegen/tests/Dialect/DMA/IR/roundtrip.mlir b/codegen/tests/Dialect/DMA/IR/roundtrip.mlir new file mode 100644 index 0000000..de6d37b --- /dev/null +++ b/codegen/tests/Dialect/DMA/IR/roundtrip.mlir @@ -0,0 +1,11 @@ +// RUN: quidditch-opt %s --verify-roundtrip + +func.func @test(%arg0 : memref) { + dma.wait_for_transfers + return +} + +func.func @test3(%arg0 : tensor) -> (tensor, !dma.token) { + %0:2 = dma.start_tensor_copy of %arg0 to #quidditch_snitch.l1_encoding -> tensor + return %0#0, %0#1 : tensor, !dma.token +} diff --git a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir index 97d07c3..97fce22 100644 --- a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir +++ b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir @@ -1,83 +1,5 @@ // RUN: quidditch-opt %s --one-shot-bufferize | FileCheck %s -// CHECK: #[[$MAP2:.*]] = affine_map<()[s0, s1] -> (s0 + s1)> - -// CHECK: func @copy_l1_buffer( -func.func @copy_l1_buffer(%arg0 : tensor<32xf32>) -> (tensor<32xf32>, !quidditch_snitch.dma_token) { - // CHECK: %[[ARG0:.*]] = bufferization.to_memref - - // CHECK: %[[ALLOC:.*]] = 
memref.alloc() - // CHECK-SAME: : memref<32xf32, #quidditch_snitch.l1_encoding> - // CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC]] - // CHECK-SAME: to memref<32xf32, strided<[1]>, #quidditch_snitch.l1_encoding> - // CHECK: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[ARG0]] - // CHECK-SAME: to %[[SUBVIEW]] - // CHECK: %[[R:.*]] = bufferization.to_tensor %[[ALLOC]] - %r, %token = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor<32xf32> -> tensor<32xf32> - // CHECK: return %[[R]], %[[TOKEN]] - return %r, %token : tensor<32xf32>, !quidditch_snitch.dma_token -} - -// CHECK: func @copy_l1_buffer_elided( -func.func @copy_l1_buffer_elided(%arg0 : tensor<32xf32>) -> tensor<32xf32> { - // CHECK: memref.alloc() - // CHECK-NOT: memref.alloc() - %r:2 = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor<32xf32> -> tensor<32xf32> - %r2 = quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor<32xf32> to %r#0 using %r#1 -> tensor<32xf32> - %r3:2 = quidditch_snitch.start_tensor_copy %r2 to L1 : tensor<32xf32> -> tensor<32xf32> - %r4 = quidditch_snitch.wait_for_tensor_copy of %r2 : tensor<32xf32> to %r3#0 using %r3#1 -> tensor<32xf32> - // CHECK: return - return %r4 : tensor<32xf32> -} - -// CHECK: func @copy_l1_buffer_alloca_elided( -func.func @copy_l1_buffer_alloca_elided() -> tensor<32xf32> { - // CHECK: memref.alloc() - // CHECK-NOT: memref.alloc() - %r = bufferization.alloc_tensor() {memory_space = #quidditch_snitch.l1_encoding} : tensor<32xf32> - %r2:2 = quidditch_snitch.start_tensor_copy %r to L1 : tensor<32xf32> -> tensor<32xf32> - // CHECK: return - return %r2#0 : tensor<32xf32> -} - -// CHECK: func @scf_for_copy_l1_buffer( -func.func @scf_for_copy_l1_buffer() -> tensor<32xf32> { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - // CHECK: %[[MEMREF:.*]] = memref.alloc - %r = bufferization.alloc_tensor() {memory_space = #quidditch_snitch.l1_encoding} : tensor<32xf32> - %r2:2 = quidditch_snitch.start_tensor_copy %r to L1 : tensor<32xf32> -> tensor<32xf32> - // CHECK-NEXT: quidditch_snitch.completed_token - // CHECK-NEXT: %[[R:.*]] = scf.for - // CHECK-SAME: iter_args(%[[ITER:.*]] = %[[MEMREF]]) - // CHECK-NEXT: quidditch_snitch.completed_token - // CHECK-NEXT: scf.yield %[[ITER]] - // CHECK: bufferization.to_tensor %[[R]] - %r3 = scf.for %i = %c0 to %c1 step %c1 iter_args(%iter = %r2#0) -> (tensor<32xf32>) { - %r4:2 = quidditch_snitch.start_tensor_copy %iter to L1 : tensor<32xf32> -> tensor<32xf32> - scf.yield %r4#0 : tensor<32xf32> - } - return %r3 : tensor<32xf32> -} - -// CHECK: func @copy_l1_buffer_dynamic_dims( -func.func @copy_l1_buffer_dynamic_dims(%arg0 : tensor) -> tensor { - // CHECK: %[[ARG0:.*]] = bufferization.to_memref - // CHECK: %[[ZERO:.*]] = arith.constant 0 - // CHECK: %[[DIM_IN:.*]] = memref.dim %[[ARG0]], %[[ZERO]] - // CHECK: %[[DIM:.*]] = affine.apply #{{.*}}()[%[[DIM_IN]]] - // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]]) - // CHECK-SAME: : memref - // CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC]] - // CHECK-SAME: to memref, #quidditch_snitch.l1_encoding> - // CHECK: quidditch_snitch.start_dma_transfer from %[[ARG0]] - // CHECK-SAME: to %[[SUBVIEW]] - // CHECK: %[[R:.*]] = bufferization.to_tensor %[[ALLOC]] - %r:2 = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor -> tensor - // CHECK: return %[[R]] - return %r#0 : tensor -} - // CHECK-LABEL: @pipeline_op( func.func @pipeline_op(%arg0_dim : index) -> tensor { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 @@ -142,49 +64,3 @@ func.func @sync_tensor() -> tensor<32xf32> 
{ // CHECK: return %[[R]] return %r : tensor<32xf32> } - -// CHECK-LABEL: @tensor_copy_pad -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -// CHECK-SAME: %[[PAD0:[[:alnum:]]+]] -// CHECK-SAME: %[[PAD1:[[:alnum:]]+]] -func.func @tensor_copy_pad(%arg0 : tensor, %pad0 : index, %pad1 : index) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]] - // CHECK: %[[ZERO:.*]] = arith.constant 0 - // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]] - // CHECK: %[[ONE:.*]] = arith.constant 1 - // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]] - // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]] - // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]] - // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]]) - // CHECK: start_zero_mem_transfer %[[ALLOC]] - // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1] - // CHECK: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]] - // CHECK-SAME: to %[[UNPADDED]] - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with zero to [%pad0, %pad1] : tensor -> tensor - // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] - // CHECK: return %[[TENSOR]], %[[TOKEN]] - return %r, %t : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_copy_pad_undef -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -// CHECK-SAME: %[[PAD0:[[:alnum:]]+]] -// CHECK-SAME: %[[PAD1:[[:alnum:]]+]] -func.func @tensor_copy_pad_undef(%arg0 : tensor, %pad0 : index, %pad1 : index) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]] - // CHECK: %[[ZERO:.*]] = arith.constant 0 - // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]] - // CHECK: %[[ONE:.*]] = arith.constant 1 - // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]] - // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]] - // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]] - // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]]) - // CHECK-NOT: start_zero_mem_transfer - // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1] - // CHECK-NEXT: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]] - // CHECK-SAME: to %[[UNPADDED]] - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with undef to [%pad0, %pad1] : tensor -> tensor - // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] - // CHECK: return %[[TENSOR]], %[[TOKEN]] - return %r, %t : tensor, !quidditch_snitch.dma_token -} diff --git a/codegen/tests/Dialect/Snitch/IR/canonicalization.mlir b/codegen/tests/Dialect/Snitch/IR/canonicalization.mlir index f049168..49aa8ad 100644 --- a/codegen/tests/Dialect/Snitch/IR/canonicalization.mlir +++ b/codegen/tests/Dialect/Snitch/IR/canonicalization.mlir @@ -36,22 +36,6 @@ func.func @identical_argument(%arg0 : i32) { return } -// CHECK-LABEL: @wait_gets_removed -func.func @wait_gets_removed() { - // CHECK-NEXT: return - %0 = quidditch_snitch.completed_token - quidditch_snitch.wait_for_dma_transfers %0 : !quidditch_snitch.dma_token - return -} - -// CHECK-LABEL: @noop_transfer -func.func @noop_transfer(%arg0 : memref) -> !quidditch_snitch.dma_token { - // CHECK-NEXT: %[[R:.*]] = quidditch_snitch.completed_token - // CHECK-NEXT: return %[[R]] - %0 = quidditch_snitch.start_dma_transfer from %arg0 : memref to %arg0 : memref - return %0 : !quidditch_snitch.dma_token -} - // 
CHECK-LABEL: @pipeline_dead_block_arg( func.func @pipeline_dead_block_arg(%tensor : tensor) { %c0 = arith.constant 0 : index @@ -91,97 +75,3 @@ func.func @pipeline_invariant(%tensor : tensor) { } return } - -// CHECK-LABEL: @tensor_wait_gets_removed -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -// CHECK-SAME: %[[ARG1:[[:alnum:]]+]] -func.func @tensor_wait_gets_removed(%arg0 : tensor, %arg1 : tensor) -> tensor { - // CHECK-NEXT: return %[[ARG1]] - %t = quidditch_snitch.completed_token - %0 = quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor to %arg1 using %t -> tensor - return %0 : tensor -} - -// CHECK-LABEL: @tensor_noop_transfer -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -func.func @tensor_noop_transfer(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[T2:.*]] = quidditch_snitch.completed_token - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[ARG0]] - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor -> tensor - // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[ARG0]] - // CHECK-SAME: to %[[R]] using %[[T]] - %0 = quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor - - // CHECK-NOT: wait_for_tensor_copy - %r2, %t2 = quidditch_snitch.start_tensor_copy %0 to L1 : tensor -> tensor - - // CHECK: return %[[R2]], %[[T2]] - return %r2, %t2 : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_noop_pad -func.func @tensor_noop_pad(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy - // CHECK-NOT: pad with - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with zero to [0] : tensor -> tensor - // CHECK-NEXT: return %[[R]], %[[T]] - return %r, %t : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_pad_constant -func.func @tensor_pad_constant(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - %zero = arith.constant 0 : index - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy - // CHECK-NOT: pad with - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with zero to [%zero] : tensor -> tensor - // CHECK-NEXT: return %[[R]], %[[T]] - return %r, %t : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_noop_transfer_pad -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -func.func @tensor_noop_transfer_pad(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[T2:.*]] = quidditch_snitch.completed_token - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[ARG0]] - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with zero to [1] : tensor -> tensor - // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[ARG0]] - // CHECK-SAME: to %[[R]] using %[[T]] - %0 = quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor - - // CHECK-NOT: wait_for_tensor_copy - %r2, %t2 = quidditch_snitch.start_tensor_copy %0 to L1 : tensor -> tensor - - // CHECK: return %[[R2]], %[[T2]] - return %r2, %t2 : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_noop_transfer_pad_neg -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -func.func @tensor_noop_transfer_pad_neg(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: start_tensor_copy - // CHECK: wait_for_tensor_copy - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy - // CHECK: return %[[R]], %[[T]] - - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor -> tensor - %0 = 
quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor - %r2, %t2 = quidditch_snitch.start_tensor_copy %0 to L1 pad with zero to [1] : tensor -> tensor - return %r2, %t2 : tensor, !quidditch_snitch.dma_token -} - -// CHECK-LABEL: @tensor_noop_transfer_same_padding -// CHECK-SAME: %[[ARG0:[[:alnum:]]+]] -func.func @tensor_noop_transfer_same_padding(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - // CHECK: %[[T2:.*]] = quidditch_snitch.completed_token - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[ARG0]] - %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with zero to [1] : tensor -> tensor - // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[ARG0]] - // CHECK-SAME: to %[[R]] using %[[T]] - %0 = quidditch_snitch.wait_for_tensor_copy of %arg0 : tensor to %r using %t -> tensor - - // CHECK-NOT: wait_for_tensor_copy - %r2, %t2 = quidditch_snitch.start_tensor_copy %0 to L1 pad with zero to [1] : tensor -> tensor - - // CHECK: return %[[R2]], %[[T2]] - return %r2, %t2 : tensor, !quidditch_snitch.dma_token -} diff --git a/codegen/tests/Dialect/Snitch/IR/roundtrip.mlir b/codegen/tests/Dialect/Snitch/IR/roundtrip.mlir index 0f22a4d..e30a6d7 100644 --- a/codegen/tests/Dialect/Snitch/IR/roundtrip.mlir +++ b/codegen/tests/Dialect/Snitch/IR/roundtrip.mlir @@ -5,11 +5,5 @@ func.func @test(%arg0 : memref) { ^bb0(%arg1 : memref): } - quidditch_snitch.wait_for_dma_transfers return } - -func.func @test3(%arg0 : tensor) -> (tensor, !quidditch_snitch.dma_token) { - %0:2 = quidditch_snitch.start_tensor_copy %arg0 to L1 : tensor -> tensor - return %0#0, %0#1 : tensor, !quidditch_snitch.dma_token -} diff --git a/codegen/tests/Dialect/Snitch/Transforms/lower-pipeline.mlir b/codegen/tests/Dialect/Snitch/Transforms/lower-pipeline.mlir index 34d73dc..59254c0 100644 --- a/codegen/tests/Dialect/Snitch/Transforms/lower-pipeline.mlir +++ b/codegen/tests/Dialect/Snitch/Transforms/lower-pipeline.mlir @@ -34,7 +34,7 @@ func.func @test( // CHECK-NEXT: yield %[[ALLOCA0]] // CHECK: default // CHECK-NEXT: yield %[[ALLOCA1]] - // CHECK: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %{{.*}} to %[[ALLOCA]] + // CHECK: %[[TOKEN:.*]] = dma.start_transfer from %{{.*}} to %[[ALLOCA]] // Full pipeline. 
// CHECK: %[[NEW_LB:.*]] = arith.addi %[[LB]], %[[STEP]] @@ -47,18 +47,18 @@ func.func @test( %subview_3 = memref.subview %9[%arg1, %arg0] [40, 100] [1, 1] : memref<1200x400xf64, strided<[400, 1], offset: ?>> to memref<40x100xf64, strided<[400, 1], offset: ?>> %alloca_4 = memref.alloca() {alignment = 64 : i64} : memref<40x100xf64, #quidditch_snitch.l1_encoding> - %16 = quidditch_snitch.start_dma_transfer from %subview_3 : memref<40x100xf64, strided<[400, 1], offset: ?>> to %alloca_4 : memref<40x100xf64, #quidditch_snitch.l1_encoding> - quidditch_snitch.pipeline_yield %alloca_4, %16 : memref<40x100xf64, #quidditch_snitch.l1_encoding>, !quidditch_snitch.dma_token + %16 = dma.start_transfer from %subview_3 : memref<40x100xf64, strided<[400, 1], offset: ?>> to %alloca_4 : memref<40x100xf64, #quidditch_snitch.l1_encoding> + quidditch_snitch.pipeline_yield %alloca_4, %16 : memref<40x100xf64, #quidditch_snitch.l1_encoding>, !dma.token }, { - ^bb0(%arg1: index, %arg2: memref<40x100xf64, #quidditch_snitch.l1_encoding>, %arg3: !quidditch_snitch.dma_token): + ^bb0(%arg1: index, %arg2: memref<40x100xf64, #quidditch_snitch.l1_encoding>, %arg3: !dma.token): // CHECK: %[[STAGE1_IV:.*]] = affine.apply #[[$MAP3]](%[[IV]]) // CHECK: memref.subview %{{.*}}[0, %[[STAGE1_IV]]] - // CHECK: wait_for_dma_transfers %[[YIELDED1]] + // CHECK: wait_for_transfers %[[YIELDED1]] // CHECK: linalg.matmul_transpose_b ins(%{{.*}}, %[[YIELDED0]] : {{.*}}) // CHECK: yield %[[NEXT_YIELDED]], %{{.*}} : %subview_3 = memref.subview %alloca[0, %arg1] [1, 40] [1, 1] : memref<1x1200xf64, #quidditch_snitch.l1_encoding> to memref<1x40xf64, strided<[1200, 1], offset: ?>, #quidditch_snitch.l1_encoding> - quidditch_snitch.wait_for_dma_transfers %arg3 : !quidditch_snitch.dma_token + dma.wait_for_transfers %arg3 : !dma.token linalg.matmul_transpose_b ins(%alloca2, %arg2 : memref<1x100xf64, #quidditch_snitch.l1_encoding>, memref<40x100xf64, #quidditch_snitch.l1_encoding>) outs(%out : memref<1x40xf64, #quidditch_snitch.l1_encoding>) @@ -66,7 +66,7 @@ func.func @test( // CHECK: %[[IV:.*]] = affine.apply #[[$MAP4]]() // CHECK: %[[STAGE1_IV:.*]] = affine.apply #[[$MAP5]]() // CHECK: memref.subview %{{.*}}[0, %[[STAGE1_IV]]] - // CHECK: wait_for_dma_transfers %[[LAST]]#1 + // CHECK: wait_for_transfers %[[LAST]]#1 // CHECK: linalg.matmul_transpose_b ins(%{{.*}}, %[[LAST]]#0 : {{.*}}) return } diff --git a/codegen/tests/Dialect/Snitch/Transforms/pipeline-copy-compute.mlir b/codegen/tests/Dialect/Snitch/Transforms/pipeline-copy-compute.mlir index b9c70aa..0190eaf 100644 --- a/codegen/tests/Dialect/Snitch/Transforms/pipeline-copy-compute.mlir +++ b/codegen/tests/Dialect/Snitch/Transforms/pipeline-copy-compute.mlir @@ -16,21 +16,21 @@ func.func @test(%arg0: index, %extracted_slice : tensor<1x100xf64>, %14 : tensor // CHECK: pipeline %[[C0]] to %[[C1200]] step %[[C40]] inits(%[[EMPTY]]) %24 = scf.for %arg2 = %c0 to %c1200 step %c40 iter_args(%arg3 = %arg1) -> (tensor<1x1200xf64>) { // CHECK: ^{{.*}}(%[[IV:.*]]: index, %[[ITER:[[:alnum:]]+]]: - // CHECK: %[[RESULT0:.*]], %[[TOKEN0:.*]] = quidditch_snitch.start_tensor_copy %[[ARG1]] + // CHECK: %[[RESULT0:.*]], %[[TOKEN0:.*]] = dma.start_tensor_copy of %[[ARG1]] // CHECK: %[[SLICE1:.*]] = tensor.extract_slice %[[ARG2]][%[[IV]], %[[ARG0]]] - // CHECK: %[[RESULT1:.*]], %[[TOKEN1:.*]] = quidditch_snitch.start_tensor_copy %[[SLICE1]] + // CHECK: %[[RESULT1:.*]], %[[TOKEN1:.*]] = dma.start_tensor_copy of %[[SLICE1]] // CHECK: %[[SLICE2:.*]] = tensor.extract_slice %[[ITER]][0, %[[IV]]] - // CHECK: 
%[[RESULT2:.*]], %[[TOKEN2:.*]] = quidditch_snitch.start_tensor_copy %[[SLICE2]] + // CHECK: %[[RESULT2:.*]], %[[TOKEN2:.*]] = dma.start_tensor_copy of %[[SLICE2]] // CHECK: pipeline_yield %[[ITER]], %[[RESULT0:.*]], %[[TOKEN0]], %[[SLICE1]], %[[RESULT1]], %[[TOKEN1]], %[[SLICE2]], %[[RESULT2]], %[[TOKEN2]] %extracted_slice_6 = tensor.extract_slice %14[%arg2, %arg0] [40, 100] [1, 1] : tensor<1200x400xf64> to tensor<40x100xf64> %extracted_slice_7 = tensor.extract_slice %arg3[0, %arg2] [1, 40] [1, 1] : tensor<1x1200xf64> to tensor<1x40xf64> - %result_8, %token_9 = quidditch_snitch.start_tensor_copy %extracted_slice to L1 : tensor<1x100xf64> -> tensor<1x100xf64> - %25 = quidditch_snitch.wait_for_tensor_copy of %extracted_slice : tensor<1x100xf64> to %result_8 using %token_9 -> tensor<1x100xf64> - %result_10, %token_11 = quidditch_snitch.start_tensor_copy %extracted_slice_6 to L1 : tensor<40x100xf64> -> tensor<40x100xf64> - %26 = quidditch_snitch.wait_for_tensor_copy of %extracted_slice_6 : tensor<40x100xf64> to %result_10 using %token_11 -> tensor<40x100xf64> - %result_12, %token_13 = quidditch_snitch.start_tensor_copy %extracted_slice_7 to L1 : tensor<1x40xf64> -> tensor<1x40xf64> - %27 = quidditch_snitch.wait_for_tensor_copy of %extracted_slice_7 : tensor<1x40xf64> to %result_12 using %token_13 -> tensor<1x40xf64> + %result_8, %token_9 = dma.start_tensor_copy of %extracted_slice to #quidditch_snitch.l1_encoding : tensor<1x100xf64> -> tensor<1x100xf64> + %25 = dma.wait_for_tensor_copy of %extracted_slice : tensor<1x100xf64> to %result_8 using %token_9 -> tensor<1x100xf64> + %result_10, %token_11 = dma.start_tensor_copy of %extracted_slice_6 to #quidditch_snitch.l1_encoding : tensor<40x100xf64> -> tensor<40x100xf64> + %26 = dma.wait_for_tensor_copy of %extracted_slice_6 : tensor<40x100xf64> to %result_10 using %token_11 -> tensor<40x100xf64> + %result_12, %token_13 = dma.start_tensor_copy of %extracted_slice_7 to #quidditch_snitch.l1_encoding : tensor<1x40xf64> -> tensor<1x40xf64> + %27 = dma.wait_for_tensor_copy of %extracted_slice_7 : tensor<1x40xf64> to %result_12 using %token_13 -> tensor<1x40xf64> // CHECK: ^{{.*}}( // CHECK-SAME: %[[IV:[[:alnum:]]+]] @@ -43,13 +43,13 @@ func.func @test(%arg0: index, %extracted_slice : tensor<1x100xf64>, %14 : tensor // CHECK-SAME: %[[SLICE2:[[:alnum:]]+]] // CHECK-SAME: %[[RESULT2:[[:alnum:]]+]] // CHECK-SAME: %[[TOKEN2:[[:alnum:]]+]] - // CHECK: %[[OPA:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[ARG1]] + // CHECK: %[[OPA:.*]] = dma.wait_for_tensor_copy of %[[ARG1]] // CHECK-SAME: to %[[RESULT0]] // CHECK-SAME: using %[[TOKEN0]] - // CHECK: %[[OPB:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[SLICE1]] + // CHECK: %[[OPB:.*]] = dma.wait_for_tensor_copy of %[[SLICE1]] // CHECK-SAME: to %[[RESULT1]] // CHECK-SAME: using %[[TOKEN1]] - // CHECK: %[[OPC:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[SLICE2]] + // CHECK: %[[OPC:.*]] = dma.wait_for_tensor_copy of %[[SLICE2]] // CHECK-SAME: to %[[RESULT2]] // CHECK-SAME: using %[[TOKEN2]] // CHECK: %[[RES:.*]] = linalg.matmul_transpose_b diff --git a/codegen/tests/Dialect/Snitch/Transforms/promote-operands-to-l1.mlir b/codegen/tests/Dialect/Snitch/Transforms/promote-operands-to-l1.mlir index 1c4c6de..a71201f 100644 --- a/codegen/tests/Dialect/Snitch/Transforms/promote-operands-to-l1.mlir +++ b/codegen/tests/Dialect/Snitch/Transforms/promote-operands-to-l1.mlir @@ -6,16 +6,16 @@ func.func @test(%a : tensor<32x32xf32>, %b : tensor<32x32xf32>) -> tensor<32x32xf32> { // CHECK: %[[E:.*]] = 
bufferization.alloc_tensor %e = bufferization.alloc_tensor() : tensor<32x32xf32> - // CHECK: %[[A1:.*]], %[[TOKEN:.*]] = quidditch_snitch.start_tensor_copy %[[A]] to L1 - // CHECK: %[[A2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]] + // CHECK: %[[A1:.*]], %[[TOKEN:.*]] = dma.start_tensor_copy of %[[A]] to #quidditch_snitch.l1_encoding + // CHECK: %[[A2:.*]] = dma.wait_for_tensor_copy of %[[A]] // CHECK-SAME: to %[[A1]] // CHECK-SAME: using %[[TOKEN]] - // CHECK: %[[B1:.*]], %[[TOKEN:.*]] = quidditch_snitch.start_tensor_copy %[[B]] to L1 - // CHECK: %[[B2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[B]] + // CHECK: %[[B1:.*]], %[[TOKEN:.*]] = dma.start_tensor_copy of %[[B]] to #quidditch_snitch.l1_encoding + // CHECK: %[[B2:.*]] = dma.wait_for_tensor_copy of %[[B]] // CHECK-SAME: to %[[B1]] // CHECK-SAME: using %[[TOKEN]] - // CHECK: %[[E1:.*]], %[[TOKEN:.*]] = quidditch_snitch.start_tensor_copy %[[E]] to L1 - // CHECK: %[[E2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[E]] + // CHECK: %[[E1:.*]], %[[TOKEN:.*]] = dma.start_tensor_copy of %[[E]] to #quidditch_snitch.l1_encoding + // CHECK: %[[E2:.*]] = dma.wait_for_tensor_copy of %[[E]] // CHECK-SAME: to %[[E1]] // CHECK-SAME: using %[[TOKEN]] // CHECK: linalg.matmul ins(%[[A2]], %[[B2]] : {{.*}}) outs(%[[E2]] : {{.*}}) diff --git a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir index 4359b9e..2265a44 100644 --- a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir +++ b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir @@ -4,9 +4,9 @@ // CHECK-SAME: %[[A:[[:alnum:]]+]]: tensor<32x32xf32> func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> { %c = arith.constant 0.0 : f32 - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]] - // CHECK-SAME: pad with zero to [1, 1] - // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]] + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy of %[[A]] + // CHECK-SAME: pad with zero by [1, 1] + // CHECK: %[[R2:.*]] = dma.wait_for_tensor_copy of %[[A]] // CHECK-SAME: to %[[R]] // CHECK-SAME: using %[[T]] %0 = tensor.pad %a low[0, 0] high[1, 1] { @@ -21,9 +21,9 @@ func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> { // CHECK-SAME: %[[A:[[:alnum:]]+]]: tensor<32x32xf32> func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> { %c = ub.poison : f32 - // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]] - // CHECK-SAME: pad with undef to [1, 1] - // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]] + // CHECK: %[[R:.*]], %[[T:.*]] = dma.start_tensor_copy of %[[A]] + // CHECK-SAME: pad with undef by [1, 1] + // CHECK: %[[R2:.*]] = dma.wait_for_tensor_copy of %[[A]] // CHECK-SAME: to %[[R]] // CHECK-SAME: using %[[T]] %0 = tensor.pad %a low[0, 0] high[1, 1] { diff --git a/codegen/tests/Dialect/Snitch/Transforms/specialize-dma-code.mlir b/codegen/tests/Dialect/Snitch/Transforms/specialize-dma-code.mlir index b579ec9..351b2d7 100644 --- a/codegen/tests/Dialect/Snitch/Transforms/specialize-dma-code.mlir +++ b/codegen/tests/Dialect/Snitch/Transforms/specialize-dma-code.mlir @@ -14,12 +14,12 @@ func.func @test(%a : memref<32xf32>, %b : memref<32xf32>, %cond : i1) { %a_l1 = memref.view %view[%c0][] : memref<512xi8> to memref<32xf32> %b_l1 = memref.view %view[%c256][] : memref<512xi8> to memref<32xf32> - // CHECK-NEXT: quidditch_snitch.completed_token - // CHECK-NEXT:
quidditch_snitch.completed_token + // CHECK-NEXT: dma.completed_token + // CHECK-NEXT: dma.completed_token // CHECK-NEXT: quidditch_snitch.barrier - quidditch_snitch.start_dma_transfer from %a : memref<32xf32> to %a_l1 : memref<32xf32> - %t = quidditch_snitch.start_dma_transfer from %b : memref<32xf32> to %b_l1 : memref<32xf32> - quidditch_snitch.wait_for_dma_transfers %t : !quidditch_snitch.dma_token + dma.start_transfer from %a : memref<32xf32> to %a_l1 : memref<32xf32> + %t = dma.start_transfer from %b : memref<32xf32> to %b_l1 : memref<32xf32> + dma.wait_for_transfers %t : !dma.token // CHECK-NEXT: microkernel // CHECK: } @@ -31,31 +31,31 @@ func.func @test(%a : memref<32xf32>, %b : memref<32xf32>, %cond : i1) { // CHECK-NEXT: quidditch_snitch.microkernel_fence // CHECK-NEXT: quidditch_snitch.barrier - // CHECK-NEXT: quidditch_snitch.completed_token - %t2 = quidditch_snitch.start_dma_transfer from %b_l1 : memref<32xf32> to %b : memref<32xf32> + // CHECK-NEXT: dma.completed_token + %t2 = dma.start_transfer from %b_l1 : memref<32xf32> to %b : memref<32xf32> // CHECK-NEXT: quidditch_snitch.barrier - quidditch_snitch.wait_for_dma_transfers %t2 : !quidditch_snitch.dma_token + dma.wait_for_transfers %t2 : !dma.token // CHECK: scf.if - %r:2 = scf.if %cond -> (!quidditch_snitch.dma_token, index) { - // CHECK-NEXT: %[[C:.*]] = quidditch_snitch.completed_token - %t3 = quidditch_snitch.start_dma_transfer from %b_l1 : memref<32xf32> to %b : memref<32xf32> + %r:2 = scf.if %cond -> (!dma.token, index) { + // CHECK-NEXT: %[[C:.*]] = dma.completed_token + %t3 = dma.start_transfer from %b_l1 : memref<32xf32> to %b : memref<32xf32> // CHECK-NEXT: %[[I:.*]] = quidditch_snitch.compute_core_index %i = quidditch_snitch.compute_core_index // CHECK-NEXT: yield %[[C]], %[[I]] - scf.yield %t3, %i : !quidditch_snitch.dma_token, index + scf.yield %t3, %i : !dma.token, index } else { // CHECK-NEXT: else - // CHECK-NEXT: %[[C:.*]] = quidditch_snitch.completed_token - %c = quidditch_snitch.completed_token + // CHECK-NEXT: %[[C:.*]] = dma.completed_token + %c = dma.completed_token // CHECK-NEXT: %[[I:.*]] = arith.constant %i = arith.constant 1 : index // CHECK-NEXT: yield %[[C]], %[[I]] - scf.yield %c, %i : !quidditch_snitch.dma_token, index + scf.yield %c, %i : !dma.token, index } // CHECK: quidditch_snitch.barrier - quidditch_snitch.wait_for_dma_transfers %r#0 : !quidditch_snitch.dma_token + dma.wait_for_transfers %r#0 : !dma.token // CHECK-NEXT: return return } @@ -67,25 +67,25 @@ func.func @test(%a : memref<32xf32>, %b : memref<32xf32>, %cond : i1) { // CHECK: memref.view // CHECK-NEXT: memref.view -// CHECK-NEXT: quidditch_snitch.start_dma_transfer -// CHECK-NEXT: quidditch_snitch.start_dma_transfer -// CHECK-NEXT: quidditch_snitch.wait_for_dma_transfers +// CHECK-NEXT: dma.start_transfer +// CHECK-NEXT: dma.start_transfer +// CHECK-NEXT: dma.wait_for_transfers // CHECK-NEXT: quidditch_snitch.barrier // CHECK-NEXT: quidditch_snitch.barrier -// CHECK-NEXT: quidditch_snitch.start_dma_transfer -// CHECK-NEXT: quidditch_snitch.wait_for_dma_transfers +// CHECK-NEXT: dma.start_transfer +// CHECK-NEXT: dma.wait_for_transfers // CHECK-NEXT: quidditch_snitch.barrier // CHECK-NEXT: scf.if -// CHECK-NEXT: quidditch_snitch.start_dma_transfer +// CHECK-NEXT: dma.start_transfer // CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 // CHECK-NEXT: yield %{{.*}}, %[[ZERO]] : // CHECK-NEXT: else // CHECK-NEXT: completed_token // CHECK-NEXT: arith.constant // CHECK-NEXT: yield -// CHECK: quidditch_snitch.wait_for_dma_transfers +// CHECK: 
dma.wait_for_transfers // CHECK-NEXT: quidditch_snitch.barrier // CHECK-NEXT: return diff --git a/codegen/tools/CMakeLists.txt b/codegen/tools/CMakeLists.txt index 275568b..e3da2fb 100644 --- a/codegen/tools/CMakeLists.txt +++ b/codegen/tools/CMakeLists.txt @@ -4,6 +4,7 @@ target_link_libraries(quidditch-opt MLIROptLib Quidditch::Conversion::ConvertSnitchToLLVM Quidditch::Conversion::ConvertToRISCV + Quidditch::Dialect::DMA::Extensions::DMACoreSpecializationOpInterfaceImpl Quidditch::Dialect::Snitch::IR::QuidditchSnitchDialect Quidditch::Dialect::Snitch::Transforms::Passes Quidditch::Target::Passes diff --git a/codegen/tools/quidditch-opt.cpp b/codegen/tools/quidditch-opt.cpp index a949287..68cd9e9 100644 --- a/codegen/tools/quidditch-opt.cpp +++ b/codegen/tools/quidditch-opt.cpp @@ -2,6 +2,8 @@ #include #include "Quidditch/Conversion/Passes.h" +#include "Quidditch/Dialect/DMA/Extensions/DMACoreSpecializationOpInterfaceImpl.h" +#include "Quidditch/Dialect/DMA/IR/DMADialect.h" #include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h" #include "Quidditch/Dialect/Snitch/Transforms/Passes.h" #include "Quidditch/Target/Passes.h" @@ -26,8 +28,10 @@ int main(int argc, char **argv) { // Be lazy and support all upstream dialects as input dialects. DialectRegistry registry; + quidditch::dma::registerDMACoreSpecializationOpInterface(registry); iree_compiler::registerAllDialects(registry); - registry.insert(); + registry.insert(); quidditch::registerPasses(); quidditch::registerConversionPasses();