From ef92dce7da7e6caf9d0d43c521259e1095f1de41 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Wed, 11 Dec 2024 08:17:06 -0800 Subject: [PATCH] Fix and refactor DmaDimConfig --- .../AMDAIECanonicalizeDoublyStridedOp.cpp | 10 +- .../Transforms/AMDAIECombineStridedOps.cpp | 8 +- .../Transforms/AMDAIEControlCodeLowering.cpp | 3 +- .../Transforms/AMDAIEDmaLoopSubsumption.cpp | 48 +++--- .../Transforms/AMDAIELowerToAIE.cpp | 5 +- .../Transforms/Utils/AMDAIEDmaUtils.cpp | 28 +++ .../Transforms/Utils/AMDAIEDmaUtils.h | 160 +++--------------- .../Transforms/test/AMDAIEDmaUtilsTest.cpp | 45 +++++ ...lize_doubly_strided_op_hardware_aware.mlir | 2 +- .../aie_runtime/iree_aie_runtime.h | 26 +++ 10 files changed, 166 insertions(+), 169 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp index c8908a523..f84989265 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp @@ -54,10 +54,12 @@ class FoldDmaOpLinearDims "expected a source and target memory space for hardware aware " "linear dimension folding"); } - AMDAIE::DmaDimConfig dmaDimConfig( - deviceModel.value(), sourceMemSpace.value(), targetMemSpace.value()); - maxSourceSizes = dmaDimConfig.getMaxSizes(); - maxTargetSizes = dmaDimConfig.getMaxSizes(); + DmaDimConfig sourceDmaDimConfig(deviceModel.value(), + sourceMemSpace.value()); + maxSourceSizes = sourceDmaDimConfig.getMaxSizes(); + DmaDimConfig targetDmaDimConfig(deviceModel.value(), + targetMemSpace.value()); + maxTargetSizes = targetDmaDimConfig.getMaxSizes(); } LogicalResult sourceRes = foldLinearDims( op.getContext(), sourceOffsets, sourceSizes, sourceStrides, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp index e549ff197..2c0c97a31 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp @@ -83,10 +83,10 @@ struct CombineStridedOps return rewriter.notifyMatchFailure( nextStridedOp, "expected a source and target memory space"); } - AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, sourceMemspaceInt.value(), - targetMemspaceInt.value()); - size_t sourceMaxNbDims = dmaDimConfig.sourceMaxNbDims; - size_t targetMaxNbDims = dmaDimConfig.targetMaxNbDims; + DmaDimConfig sourceDmaDimConfig(deviceModel, sourceMemspaceInt.value()); + size_t sourceMaxNbDims = sourceDmaDimConfig.maxNbDims; + DmaDimConfig targetDmaDimConfig(deviceModel, targetMemspaceInt.value()); + size_t targetMaxNbDims = targetDmaDimConfig.maxNbDims; SmallVector sourceOffsetsA = op.getSourceMixedOffsets(); SmallVector sourceSizesA = op.getSourceMixedSizes(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index 3c8c0a843..60bb8144a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -40,7 +40,8 @@ struct HalfDmaCpyNdToNpuConverter final ArrayRef strides) const { uint8_t numIntraAddrDim = deviceModel.getDmaProp( tileType, AMDAIE::AMDAIEDmaProp::NumAddrDim); - uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; + uint8_t numAddrDim = + numIntraAddrDim + deviceModel.deviceConfig.dmaNbInterDims; auto subspanOp = dyn_cast_if_present( logicalObjFifo.getMemref().getDefiningOp()); if (!subspanOp) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index 698faeaed..3541ef33c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -152,7 +152,8 @@ struct SubsumeLoopIntoDMA /// operation. LogicalResult rewriteWithLoopLikeOpParent( AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter, - const AMDAIE::DmaDimConfig &dmaDimConfig, + const DmaDimConfig &sourceDmaDimConfig, + const DmaDimConfig &targetDmaDimConfig, const SmallVector &lowerBounds, const SmallVector &upperBounds, const SmallVector &steps, @@ -210,10 +211,10 @@ struct SubsumeLoopIntoDMA if (nbIterations > 1) nbNonUnitIterations++; } if (newSourceOffsets.size() + nbNonUnitIterations > - dmaDimConfig.sourceMaxNbDims) + sourceDmaDimConfig.maxNbDims) return failure(); if (newTargetOffsets.size() + nbNonUnitIterations > - dmaDimConfig.targetMaxNbDims) + targetDmaDimConfig.maxNbDims) return failure(); // Fail if zero stride is only supported on the outer dimension and adding @@ -309,10 +310,8 @@ struct SubsumeLoopIntoDMA insertInFront(newSourceSizes, insertSourceSizes); SmallVector newSourceStridesInt = insertInFront(newSourceStrides, insertSourceStrides); - SmallVector maxSizes = - dmaDimConfig.getMaxSizes(); - SmallVector maxStrides = - dmaDimConfig.getMaxStrides(); + SmallVector maxSizes = sourceDmaDimConfig.getMaxSizes(); + SmallVector maxStrides = sourceDmaDimConfig.getMaxStrides(); assert(maxSizes.size() >= newSourceSizesInt.size() && "Max number of dimensions exceeded"); size_t begin = maxSizes.size() - newSourceSizesInt.size(); @@ -335,10 +334,8 @@ struct SubsumeLoopIntoDMA insertInFront(newTargetSizes, insertTargetSizes); SmallVector newTargetStridesInt = insertInFront(newTargetStrides, insertTargetStrides); - SmallVector maxSizes = - dmaDimConfig.getMaxSizes(); - SmallVector maxStrides = - dmaDimConfig.getMaxStrides(); + 
SmallVector maxSizes = targetDmaDimConfig.getMaxSizes(); + SmallVector maxStrides = targetDmaDimConfig.getMaxStrides(); assert(maxSizes.size() >= newTargetSizesInt.size() && "Max number of dimensions exceeded"); size_t begin = maxSizes.size() - newTargetSizesInt.size(); @@ -413,7 +410,8 @@ struct SubsumeLoopIntoDMA /// optional `affine.apply` user for now. LogicalResult rewriteWithForOpParent( AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter, - const AMDAIE::DmaDimConfig &dmaDimConfig) const { + const DmaDimConfig &sourceDmaDimConfig, + const DmaDimConfig &targetDmaDimConfig) const { auto forOp = dyn_cast(op->getParentOp()); if (!forOp) return failure(); @@ -440,9 +438,9 @@ struct SubsumeLoopIntoDMA SmallVector upperBounds = {upperBound.value()}; SmallVector steps = {step.value()}; SmallVector> inductionValues = {curIvValues}; - return rewriteWithLoopLikeOpParent(op, rewriter, dmaDimConfig, lowerBounds, - upperBounds, steps, inductionValues, - curIvValues); + return rewriteWithLoopLikeOpParent( + op, rewriter, sourceDmaDimConfig, targetDmaDimConfig, lowerBounds, + upperBounds, steps, inductionValues, curIvValues); } /// Main rewrite function for a doubly strided operation with a `scf.forall` @@ -450,7 +448,8 @@ struct SubsumeLoopIntoDMA /// optional `affine.apply` user for now. 
LogicalResult rewriteWithForallOpParent( AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter, - const AMDAIE::DmaDimConfig &dmaDimConfig) const { + const DmaDimConfig &sourceDmaDimConfig, + const DmaDimConfig &targetDmaDimConfig) const { auto forallOp = dyn_cast(op->getParentOp()); if (!forallOp) return failure(); @@ -481,9 +480,10 @@ struct SubsumeLoopIntoDMA } inductionValues.push_back(curIvValues); } - return rewriteWithLoopLikeOpParent( - op, rewriter, dmaDimConfig, lowerBounds.value(), upperBounds.value(), - steps.value(), inductionValues, allInductionValues); + return rewriteWithLoopLikeOpParent(op, rewriter, sourceDmaDimConfig, + targetDmaDimConfig, lowerBounds.value(), + upperBounds.value(), steps.value(), + inductionValues, allInductionValues); } LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op, @@ -562,13 +562,15 @@ struct SubsumeLoopIntoDMA return rewriter.notifyMatchFailure( op, "expected a source and target memory space"); } - AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, sourceMemspaceInt.value(), - targetMemspaceInt.value()); + DmaDimConfig sourceDmaDimConfig(deviceModel, sourceMemspaceInt.value()); + DmaDimConfig targetDmaDimConfig(deviceModel, targetMemspaceInt.value()); if (isa(parentOp)) { - return rewriteWithForOpParent(op, rewriter, dmaDimConfig); + return rewriteWithForOpParent(op, rewriter, sourceDmaDimConfig, + targetDmaDimConfig); } else if (isa(parentOp)) { - return rewriteWithForallOpParent(op, rewriter, dmaDimConfig); + return rewriteWithForallOpParent(op, rewriter, sourceDmaDimConfig, + targetDmaDimConfig); } else { return rewriter.notifyMatchFailure( op, "Has parent operation of currently unsupported type"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 9a575f672..a1aa3244b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -229,9 +229,8 @@ void AIEDeviceBuilder::foldDims(const SmallVector &offsets, SmallVector tmpStrides; (void)foldUnitDims(rewriter.getContext(), offsets, sizes, strides, tmpOffsets, tmpSizes, tmpStrides); - AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, memSpace, memSpace); - SmallVector maxSizes = - dmaDimConfig.getMaxSizes(); + DmaDimConfig dmaDimConfig(deviceModel, memSpace); + SmallVector maxSizes = dmaDimConfig.getMaxSizes(); (void)foldLinearDims(rewriter.getContext(), tmpOffsets, tmpSizes, tmpStrides, newOffsets, newSizes, newStrides, maxSizes); (void)foldSingleDim(newOffsets, newSizes, newStrides); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp index 9c2e43c6e..db9eb8569 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp @@ -483,4 +483,32 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( return success(); } +//===----------------------------------------------------------------------===// +// DmaDimConfig +//===----------------------------------------------------------------------===// + +SmallVector DmaDimConfig::getMaxSizes() const { + uint32_t maxIntraSize = deviceModel.getDmaBdProp( + tileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax); + uint32_t maxInterSize = deviceModel.getDmaBdProp( + tileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax); + SmallVector maxSizes(maxNbDims, maxIntraSize); + std::fill_n(maxSizes.begin(), nbInterDims, maxInterSize); + // The outermost intra size doesn't have limit in HW. 
+ maxSizes[nbInterDims] = std::numeric_limits::max(); + return maxSizes; +} + +SmallVector DmaDimConfig::getMaxStrides() const { + uint32_t maxIntraStride = deviceModel.getDmaBdProp( + tileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax); + uint32_t maxInterStride = deviceModel.getDmaBdProp( + tileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax); + // +1 because values are encoded in HW BDs as (value - 1), so the range is + // [1:2^x]. + SmallVector stepSizes(maxNbDims, maxIntraStride + 1); + std::fill_n(stepSizes.begin(), nbInterDims, maxInterStride + 1); + return stepSizes; +} + } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h index e4dbfd36b..4ca056463 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h @@ -74,33 +74,6 @@ struct RetrieveScaleAndBias } }; -// Constant specifying the number of inter-iteration dimension for DMA -// operations. -// -// NOTE(jornt): this number is implicitly assumed in the device model and can't -// be retrieved from it afaik. -// -// Some background: -// -// DMAs support multi-dimensional addressing through buffer descriptors in two -// ways: -// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in buffer -// descriptor lingo), 'sizes' ('wraps' in buffer descriptro lingo) and -// 'padding'. When a DMA executes a buffer descriptor, it will access the data -// (read/write) as specified by the intra-iteration access pattern. -// 2. Inter-iteration access pattern. Specified via an iteration 'stride', -// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and 'size' -// is the same as 'wrap' in buffer descriptor lingo). 
Here, 'current_iteration' -// keeps track of the current execution iteration of the buffer descriptor and -// is incremented after buffer descriptor execution. the 'stride' is the offset -// to be used for each execution of the buffer descriptor, relative to the -// previous one. When 'iteration_current' is equal to 'size', the -// 'iteration_current' is reset to zero. -// -// Although DMAs can have a different number of intra-iteration dimensions, all -// DMAs have a single inter-iteration dimension (at least in AIE2 and AIE2p). -static const size_t kAMDAIEDmaNbInterDims = 1; - /// Check whether two access patterns are equal in value, starting from /// specified indices. bool areAccessPatternsEqualFromIndices(ArrayRef offsetsA, @@ -220,119 +193,40 @@ LogicalResult foldUnitDims(MLIRContext *ctx, /// be used for DMA access patterns. struct DmaDimConfig { const AMDAIE::AMDAIEDeviceModel &deviceModel; - AMDAIE::AMDAIETileType sourceTileType; - AMDAIE::AMDAIETileType targetTileType; - /// The maximum number of addressing dimensions on the source side of the DMA. - uint8_t sourceMaxNbDims{0}; - /// The maximum number of addressing dimensions on the target side of the DMA. - uint8_t targetMaxNbDims{0}; + AMDAIE::AMDAIETileType tileType; + /// The maximum number of addressing dimensions of the DMA. + uint8_t maxNbDims{0}; + /// The number of `inter` addressing dimensions of the DMA. 
+ uint8_t nbInterDims{0}; - DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel, - uint8_t sourceMemspaceInt, uint8_t targetMemspaceInt) + DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel, uint8_t memSpace) : deviceModel(deviceModel) { - uint8_t shimNbIntraDims = deviceModel.getDmaProp( - AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim); - uint8_t memTileNbIntraDims = deviceModel.getDmaProp( - AMDAIE::AMDAIETileType::MEMTILE, AMDAIE::AMDAIEDmaProp::NumAddrDim); - uint8_t coreNbIntraDims = deviceModel.getDmaProp( - AMDAIE::AMDAIETileType::AIETILE, AMDAIE::AMDAIEDmaProp::NumAddrDim); - if (sourceMemspaceInt == 0) { - sourceTileType = AMDAIE::AMDAIETileType::SHIMNOC; - sourceMaxNbDims = shimNbIntraDims + kAMDAIEDmaNbInterDims; - } else if (sourceMemspaceInt == 1) { - sourceTileType = AMDAIE::AMDAIETileType::MEMTILE; - sourceMaxNbDims = memTileNbIntraDims; - } else if (sourceMemspaceInt == 2) { - sourceTileType = AMDAIE::AMDAIETileType::AIETILE; - sourceMaxNbDims = coreNbIntraDims; - } else { - assert(false && "unsupported source memspace"); - } - if (targetMemspaceInt == 0) { - targetTileType = AMDAIE::AMDAIETileType::SHIMNOC; - targetMaxNbDims = shimNbIntraDims + kAMDAIEDmaNbInterDims; - } else if (targetMemspaceInt == 1) { - targetTileType = AMDAIE::AMDAIETileType::MEMTILE; - targetMaxNbDims = memTileNbIntraDims; - } else if (targetMemspaceInt == 2) { - targetTileType = AMDAIE::AMDAIETileType::AIETILE; - targetMaxNbDims = coreNbIntraDims; + if (memSpace == 0) { + uint8_t shimNbIntraDims = deviceModel.getDmaProp( + AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim); + tileType = AMDAIE::AMDAIETileType::SHIMNOC; + nbInterDims = deviceModel.deviceConfig.dmaNbInterDims; + maxNbDims = shimNbIntraDims + nbInterDims; + } else if (memSpace == 1) { + uint8_t memTileNbIntraDims = deviceModel.getDmaProp( + AMDAIE::AMDAIETileType::MEMTILE, AMDAIE::AMDAIEDmaProp::NumAddrDim); + tileType = AMDAIE::AMDAIETileType::MEMTILE; + 
maxNbDims = memTileNbIntraDims; + } else if (memSpace == 2) { + uint8_t coreNbIntraDims = deviceModel.getDmaProp( + AMDAIE::AMDAIETileType::AIETILE, AMDAIE::AMDAIEDmaProp::NumAddrDim); + tileType = AMDAIE::AMDAIETileType::AIETILE; + maxNbDims = coreNbIntraDims; } else { - assert(false && "unsupported target memspace"); + assert(false && "unsupported memspace: "); } } - /// Return a vector containing the max stride values for every dimension. The - /// first dimension is the inter-iteration dimension, while the latter are - /// intra-iteration dimensions. - /// NOTE: It doesn't need to be known which BDs will be used exactly as all - /// BDs on the same tile types should have the same step and wrap sizes. - /// Therefore, `BD ID == 0` is choosen to be used to retrieve device - /// information. - template - SmallVector getMaxStrides() const { - uint32_t maxIntraStride; - uint32_t maxInterStride; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - maxIntraStride = deviceModel.getDmaBdProp( - sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax); - maxInterStride = deviceModel.getDmaBdProp( - sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax); - // +1 because values are encoded in HW BDs as (value - 1), so the range is - // [1:2^x]. - SmallVector stepSizes(sourceMaxNbDims, maxIntraStride + 1); - stepSizes[0] = maxInterStride + 1; - return stepSizes; - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - maxIntraStride = deviceModel.getDmaBdProp( - targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax); - maxInterStride = deviceModel.getDmaBdProp( - targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax); - // +1 because values are encoded in HW BDs as (value - 1), so the range is - // [1:2^x]. 
- SmallVector stepSizes(targetMaxNbDims, maxIntraStride + 1); - stepSizes[0] = maxInterStride + 1; - return stepSizes; - } else { - assert(false && "Function can only operate on Source or Target"); - } - } + /// Return a vector containing the max size values for every dimension. + SmallVector getMaxSizes() const; - /// Return a vector containing the max size values for every dimension. The - /// first dimension is the inter-iteration dimension, while the latter are - /// intra-iteration dimensions. - /// NOTE: It doesn't need to be known which BDs will be used exactly as all - /// BDs on the same tile types should have the same step and wrap sizes. - /// Therefore, `BD ID == 0` is choosen to be used to retrieve device - /// information. - template - SmallVector getMaxSizes() const { - uint32_t maxIntraSize; - uint32_t maxInterSize; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - maxIntraSize = deviceModel.getDmaBdProp( - sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax); - maxInterSize = deviceModel.getDmaBdProp( - sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax); - SmallVector stepSizes(sourceMaxNbDims, maxIntraSize); - stepSizes[0] = maxInterSize; - // The outermost intra size doesn't have limit in HW. - stepSizes[1] = std::numeric_limits::max(); - return stepSizes; - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - maxIntraSize = deviceModel.getDmaBdProp( - targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax); - maxInterSize = deviceModel.getDmaBdProp( - targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax); - SmallVector stepSizes(targetMaxNbDims, maxIntraSize); - stepSizes[0] = maxInterSize; - // The outermost intra size doesn't have limit in HW. - stepSizes[1] = std::numeric_limits::max(); - return stepSizes; - } else { - assert(false && "Function can only operate on Source or Target"); - } - } + /// Return a vector containing the max stride values for every dimension. 
+ SmallVector getMaxStrides() const; }; /// Utility to move the synchronization users (`amdaie.npu.dma_wait`) directly diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp index a43f47360..60e572057 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp @@ -473,6 +473,51 @@ TEST_F(FoldTest, UnitDimsFoldAndMerge) { {1, 8}, {1024, 1}, true); } +class DmaDimConfigTest : public testing::TestWithParam { + protected: + DmaDimConfigTest() : deviceModel(getDeviceModel(GetParam())) {} + AMDAIEDeviceModel deviceModel; +}; + +TEST_P(DmaDimConfigTest, ShimTile) { + DmaDimConfig config(deviceModel, 0); + SmallVector maxSizes = config.getMaxSizes(); + SmallVector expectedMaxSizes = { + 63, std::numeric_limits::max(), 1023, 1023}; + EXPECT_EQ(maxSizes, expectedMaxSizes); + SmallVector maxStrides = config.getMaxStrides(); + SmallVector expectedMaxStrides(4, 1 << 20); + EXPECT_EQ(maxStrides, expectedMaxStrides); +} + +TEST_P(DmaDimConfigTest, MemTile) { + DmaDimConfig config(deviceModel, 1); + SmallVector maxSizes = config.getMaxSizes(); + SmallVector expectedMaxSizes = {std::numeric_limits::max(), + 1023, 1023, 1023}; + EXPECT_EQ(maxSizes, expectedMaxSizes); + SmallVector maxStrides = config.getMaxStrides(); + SmallVector expectedMaxStrides(4, 1 << 17); + EXPECT_EQ(maxStrides, expectedMaxStrides); +} + +TEST_P(DmaDimConfigTest, CoreTile) { + DmaDimConfig config(deviceModel, 2); + SmallVector maxSizes = config.getMaxSizes(); + SmallVector expectedMaxSizes = {std::numeric_limits::max(), + 255, 255}; + EXPECT_EQ(maxSizes, expectedMaxSizes); + SmallVector maxStrides = config.getMaxStrides(); + SmallVector expectedMaxStrides(3, 1 << 13); + EXPECT_EQ(maxStrides, expectedMaxStrides); +} + +INSTANTIATE_TEST_SUITE_P( + Devices, 
DmaDimConfigTest, + testing::Values(AMDAIEDevice::npu1, AMDAIEDevice::npu1_1col, + AMDAIEDevice::npu1_2col, AMDAIEDevice::npu1_3col, + AMDAIEDevice::npu1_4col, AMDAIEDevice::npu4)); + } // namespace int main(int argc, char **argv) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op_hardware_aware.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op_hardware_aware.mlir index 27712b765..a9a634202 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op_hardware_aware.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op_hardware_aware.mlir @@ -38,7 +38,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: func.func @dma_cpy_nd_no_fold // CHECK: amdaie.dma_cpy_nd(%{{.+}}[0, 0] [2, 512] [512, 1], %{{.+}}[0, 0] [2, 512] [512, 1]) // CHECK: amdaie.dma_cpy_nd(%{{.+}}[0, 0, 0] [128, 8, 512] [2048, 256, 1], %{{.+}}[0, 0, 0] [128, 8, 512] [2048, 256, 1]) -// CHECK: amdaie.dma_cpy_nd(%{{.+}}[0, 0, 0, 0, 0] [8, 8, 16, 8, 512] [1024, 128, 1024, 256, 1], %{{.+}}[0, 0, 0, 0, 0] [8, 8, 16, 8, 512] [1024, 128, 1024, 256, 1]) +// CHECK: amdaie.dma_cpy_nd(%{{.+}}[0, 0, 0, 0] [64, 16, 8, 512] [128, 1024, 256, 1], %{{.+}}[0, 0, 0, 0] [64, 16, 8, 512] [128, 1024, 256, 1]) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @dma_cpy_nd_no_fold(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index 742eb8a59..b554a0097 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ 
b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -234,6 +234,32 @@ struct AMDAIEDeviceModel { /////////////////////////////////////// // AIE Array configuration constants // /////////////////////////////////////// + /// Constant specifying the number of inter-iteration dimensions for DMA + /// operations. + /// + /// NOTE(jornt): this number is implicitly assumed in the device model and can't + /// be retrieved from it afaik. + /// + /// Some background: + /// + /// DMAs support multi-dimensional addressing through buffer descriptors in two + /// ways: + /// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in buffer + /// descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo) and + /// 'padding'. When a DMA executes a buffer descriptor, it will access the data + /// (read/write) as specified by the intra-iteration access pattern. + /// 2. Inter-iteration access pattern. Specified via an iteration 'stride', + /// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and 'size' + /// is the same as 'wrap' in buffer descriptor lingo). Here, 'current_iteration' + /// keeps track of the current execution iteration of the buffer descriptor and + /// is incremented after buffer descriptor execution. The 'stride' is the offset + /// to be used for each execution of the buffer descriptor, relative to the + /// previous one. When 'current_iteration' is equal to 'size', the + /// 'current_iteration' is reset to zero. + /// + /// Although DMAs can have a different number of intra-iteration dimensions, all + /// DMAs have a single inter-iteration dimension (at least in AIE2 and AIE2p). + uint8_t dmaNbInterDims = 1; /// The number of shim tile rows. Not found in aie-rt data structures, but /// provided as `XAIE_SHIM_NUM_ROWS`. uint8_t shimTileNumRows{1};