Skip to content

Commit

Permalink
Fix and refactor DmaDimConfig
Browse files Browse the repository at this point in the history
  • Loading branch information
jtuyls committed Dec 11, 2024
1 parent e954a73 commit 838062a
Show file tree
Hide file tree
Showing 10 changed files with 166 additions and 169 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ class FoldDmaOpLinearDims
"expected a source and target memory space for hardware aware "
"linear dimension folding");
}
AMDAIE::DmaDimConfig dmaDimConfig(
deviceModel.value(), sourceMemSpace.value(), targetMemSpace.value());
maxSourceSizes = dmaDimConfig.getMaxSizes<CopyOpOperateOn::Source>();
maxTargetSizes = dmaDimConfig.getMaxSizes<CopyOpOperateOn::Target>();
DmaDimConfig sourceDmaDimConfig(deviceModel.value(),
sourceMemSpace.value());
maxSourceSizes = sourceDmaDimConfig.getMaxSizes();
DmaDimConfig targetDmaDimConfig(deviceModel.value(),
targetMemSpace.value());
maxTargetSizes = targetDmaDimConfig.getMaxSizes();
}
LogicalResult sourceRes = foldLinearDims(
op.getContext(), sourceOffsets, sourceSizes, sourceStrides,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ struct CombineStridedOps
return rewriter.notifyMatchFailure(
nextStridedOp, "expected a source and target memory space");
}
AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, sourceMemspaceInt.value(),
targetMemspaceInt.value());
size_t sourceMaxNbDims = dmaDimConfig.sourceMaxNbDims;
size_t targetMaxNbDims = dmaDimConfig.targetMaxNbDims;
DmaDimConfig sourceDmaDimConfig(deviceModel, sourceMemspaceInt.value());
size_t sourceMaxNbDims = sourceDmaDimConfig.maxNbDims;
DmaDimConfig targetDmaDimConfig(deviceModel, targetMemspaceInt.value());
size_t targetMaxNbDims = targetDmaDimConfig.maxNbDims;

SmallVector<OpFoldResult> sourceOffsetsA = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizesA = op.getSourceMixedSizes();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ struct HalfDmaCpyNdToNpuConverter final
ArrayRef<OpFoldResult> strides) const {
uint8_t numIntraAddrDim = deviceModel.getDmaProp<uint8_t>(
tileType, AMDAIE::AMDAIEDmaProp::NumAddrDim);
uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims;
uint8_t numAddrDim =
numIntraAddrDim + deviceModel.deviceConfig.dmaNbInterDims;
auto subspanOp = dyn_cast_if_present<IREE::HAL::InterfaceBindingSubspanOp>(
logicalObjFifo.getMemref().getDefiningOp());
if (!subspanOp) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ struct SubsumeLoopIntoDMA
/// operation.
LogicalResult rewriteWithLoopLikeOpParent(
AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
const AMDAIE::DmaDimConfig &dmaDimConfig,
const DmaDimConfig &sourceDmaDimConfig,
const DmaDimConfig &targetDmaDimConfig,
const SmallVector<int64_t> &lowerBounds,
const SmallVector<int64_t> &upperBounds,
const SmallVector<int64_t> &steps,
Expand Down Expand Up @@ -210,10 +211,10 @@ struct SubsumeLoopIntoDMA
if (nbIterations > 1) nbNonUnitIterations++;
}
if (newSourceOffsets.size() + nbNonUnitIterations >
dmaDimConfig.sourceMaxNbDims)
sourceDmaDimConfig.maxNbDims)
return failure();
if (newTargetOffsets.size() + nbNonUnitIterations >
dmaDimConfig.targetMaxNbDims)
targetDmaDimConfig.maxNbDims)
return failure();

// Fail if zero stride is only supported on the outer dimension and adding
Expand Down Expand Up @@ -309,10 +310,8 @@ struct SubsumeLoopIntoDMA
insertInFront(newSourceSizes, insertSourceSizes);
SmallVector<int64_t> newSourceStridesInt =
insertInFront(newSourceStrides, insertSourceStrides);
SmallVector<int64_t> maxSizes =
dmaDimConfig.getMaxSizes<CopyOpOperateOn::Source>();
SmallVector<int64_t> maxStrides =
dmaDimConfig.getMaxStrides<CopyOpOperateOn::Source>();
SmallVector<int64_t> maxSizes = sourceDmaDimConfig.getMaxSizes();
SmallVector<int64_t> maxStrides = sourceDmaDimConfig.getMaxStrides();
assert(maxSizes.size() >= newSourceSizesInt.size() &&
"Max number of dimensions exceeded");
size_t begin = maxSizes.size() - newSourceSizesInt.size();
Expand All @@ -335,10 +334,8 @@ struct SubsumeLoopIntoDMA
insertInFront(newTargetSizes, insertTargetSizes);
SmallVector<int64_t> newTargetStridesInt =
insertInFront(newTargetStrides, insertTargetStrides);
SmallVector<int64_t> maxSizes =
dmaDimConfig.getMaxSizes<CopyOpOperateOn::Target>();
SmallVector<int64_t> maxStrides =
dmaDimConfig.getMaxStrides<CopyOpOperateOn::Target>();
SmallVector<int64_t> maxSizes = targetDmaDimConfig.getMaxSizes();
SmallVector<int64_t> maxStrides = targetDmaDimConfig.getMaxStrides();
assert(maxSizes.size() >= newTargetSizesInt.size() &&
"Max number of dimensions exceeded");
size_t begin = maxSizes.size() - newTargetSizesInt.size();
Expand Down Expand Up @@ -413,7 +410,8 @@ struct SubsumeLoopIntoDMA
/// optional `affine.apply` user for now.
LogicalResult rewriteWithForOpParent(
AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
const AMDAIE::DmaDimConfig &dmaDimConfig) const {
const DmaDimConfig &sourceDmaDimConfig,
const DmaDimConfig &targetDmaDimConfig) const {
auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
if (!forOp) return failure();

Expand All @@ -440,17 +438,18 @@ struct SubsumeLoopIntoDMA
SmallVector<int64_t> upperBounds = {upperBound.value()};
SmallVector<int64_t> steps = {step.value()};
SmallVector<DenseSet<Value>> inductionValues = {curIvValues};
return rewriteWithLoopLikeOpParent(op, rewriter, dmaDimConfig, lowerBounds,
upperBounds, steps, inductionValues,
curIvValues);
return rewriteWithLoopLikeOpParent(
op, rewriter, sourceDmaDimConfig, targetDmaDimConfig, lowerBounds,
upperBounds, steps, inductionValues, curIvValues);
}

/// Main rewrite function for a doubly strided operation with a `scf.forall`
/// parent operation. Only handle loop induction variables with an
/// optional `affine.apply` user for now.
LogicalResult rewriteWithForallOpParent(
AMDAIE::DoublyStridedOpInterface op, PatternRewriter &rewriter,
const AMDAIE::DmaDimConfig &dmaDimConfig) const {
const DmaDimConfig &sourceDmaDimConfig,
const DmaDimConfig &targetDmaDimConfig) const {
auto forallOp = dyn_cast<scf::ForallOp>(op->getParentOp());
if (!forallOp) return failure();

Expand Down Expand Up @@ -481,9 +480,10 @@ struct SubsumeLoopIntoDMA
}
inductionValues.push_back(curIvValues);
}
return rewriteWithLoopLikeOpParent(
op, rewriter, dmaDimConfig, lowerBounds.value(), upperBounds.value(),
steps.value(), inductionValues, allInductionValues);
return rewriteWithLoopLikeOpParent(op, rewriter, sourceDmaDimConfig,
targetDmaDimConfig, lowerBounds.value(),
upperBounds.value(), steps.value(),
inductionValues, allInductionValues);
}

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
Expand Down Expand Up @@ -562,13 +562,15 @@ struct SubsumeLoopIntoDMA
return rewriter.notifyMatchFailure(
op, "expected a source and target memory space");
}
AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, sourceMemspaceInt.value(),
targetMemspaceInt.value());
DmaDimConfig sourceDmaDimConfig(deviceModel, sourceMemspaceInt.value());
DmaDimConfig targetDmaDimConfig(deviceModel, targetMemspaceInt.value());

if (isa<scf::ForOp>(parentOp)) {
return rewriteWithForOpParent(op, rewriter, dmaDimConfig);
return rewriteWithForOpParent(op, rewriter, sourceDmaDimConfig,
targetDmaDimConfig);
} else if (isa<scf::ForallOp>(parentOp)) {
return rewriteWithForallOpParent(op, rewriter, dmaDimConfig);
return rewriteWithForallOpParent(op, rewriter, sourceDmaDimConfig,
targetDmaDimConfig);
} else {
return rewriter.notifyMatchFailure(
op, "Has parent operation of currently unsupported type");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,8 @@ void AIEDeviceBuilder::foldDims(const SmallVector<OpFoldResult> &offsets,
SmallVector<OpFoldResult> tmpStrides;
(void)foldUnitDims(rewriter.getContext(), offsets, sizes, strides, tmpOffsets,
tmpSizes, tmpStrides);
AMDAIE::DmaDimConfig dmaDimConfig(deviceModel, memSpace, memSpace);
SmallVector<int64_t> maxSizes =
dmaDimConfig.getMaxSizes<CopyOpOperateOn::Source>();
DmaDimConfig dmaDimConfig(deviceModel, memSpace);
SmallVector<int64_t> maxSizes = dmaDimConfig.getMaxSizes();
(void)foldLinearDims(rewriter.getContext(), tmpOffsets, tmpSizes, tmpStrides,
newOffsets, newSizes, newStrides, maxSizes);
(void)foldSingleDim(newOffsets, newSizes, newStrides);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,32 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
return success();
}

//===----------------------------------------------------------------------===//
// DmaDimConfig
//===----------------------------------------------------------------------===//

/// Return a vector with the maximum size ('wrap') allowed on every addressing
/// dimension of this DMA, outermost dimension first.
///
/// Layout of the returned vector (length `maxNbDims`):
///   - the first `nbInterDims` entries hold the inter-iteration wrap limit
///     (`IterWrapMax`),
///   - the entry right after them, i.e. the outermost intra-iteration
///     dimension, is unbounded (`int64_t` max),
///   - all remaining entries hold the intra-iteration wrap limit (`WrapMax`).
///
/// The limits are queried from the device model for BD ID 0 of `tileType`.
SmallVector<int64_t> DmaDimConfig::getMaxSizes() const {
  uint32_t maxIntraSize = deviceModel.getDmaBdProp<uint16_t>(
      tileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax);
  uint32_t maxInterSize = deviceModel.getDmaBdProp<uint8_t>(
      tileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax);
  SmallVector<int64_t> maxSizes(maxNbDims, maxIntraSize);
  // Clamp so that a configuration without intra dimensions (or with an
  // inconsistent dimension count) can never write past the end of the vector.
  const size_t numInterDims = nbInterDims < maxNbDims ? nbInterDims : maxNbDims;
  std::fill_n(maxSizes.begin(), numInterDims, maxInterSize);
  // The outermost intra size doesn't have a limit in HW. Only set it if such a
  // dimension actually exists.
  if (numInterDims < maxSizes.size())
    maxSizes[numInterDims] = std::numeric_limits<int64_t>::max();
  return maxSizes;
}

/// Return a vector with the maximum stride ('step size') allowed on every
/// addressing dimension of this DMA, outermost dimension first.
///
/// Layout of the returned vector (length `maxNbDims`):
///   - the first `nbInterDims` entries hold the inter-iteration stride limit
///     (`IterStepSizeMax + 1`),
///   - all remaining entries hold the intra-iteration stride limit
///     (`StepSizeMax + 1`).
///
/// This mirrors the layout used by `getMaxSizes`: DMAs without an
/// inter-iteration dimension (`nbInterDims == 0`) only get intra-iteration
/// limits. The limits are queried from the device model for BD ID 0 of
/// `tileType`.
SmallVector<int64_t> DmaDimConfig::getMaxStrides() const {
  uint32_t maxIntraStride = deviceModel.getDmaBdProp<uint32_t>(
      tileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax);
  uint32_t maxInterStride = deviceModel.getDmaBdProp<uint32_t>(
      tileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax);
  // +1 because values are encoded in HW BDs as (value - 1), so the range is
  // [1:2^x]. Widen to int64_t before adding so the increment cannot wrap
  // around in 32-bit arithmetic.
  const int64_t intraLimit = static_cast<int64_t>(maxIntraStride) + 1;
  const int64_t interLimit = static_cast<int64_t>(maxInterStride) + 1;
  SmallVector<int64_t> maxStrides(maxNbDims, intraLimit);
  // Only the leading `nbInterDims` dimensions are inter-iteration dimensions;
  // clamp so an empty or inconsistent configuration is never written out of
  // bounds.
  const size_t numInterDims = nbInterDims < maxNbDims ? nbInterDims : maxNbDims;
  std::fill_n(maxStrides.begin(), numInterDims, interLimit);
  return maxStrides;
}

} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -74,33 +74,6 @@ struct RetrieveScaleAndBias
}
};

// Constant specifying the number of inter-iteration dimension for DMA
// operations.
//
// NOTE(jornt): this number is implicitly assumed in the device model and can't
// be retrieved from it afaik.
//
// Some background:
//
// DMAs support multi-dimensional addressing through buffer descriptors in two
// ways:
// 1. Intra-iteration access pattern. Specified via 'strides' ('steps' in buffer
// descriptor lingo), 'sizes' ('wraps' in buffer descriptor lingo) and
// 'padding'. When a DMA executes a buffer descriptor, it will access the data
// (read/write) as specified by the intra-iteration access pattern.
// 2. Inter-iteration access pattern. Specified via an iteration 'stride',
// 'size' and 'current_iteration' ('stride' is the same as 'stepsize' and 'size'
// is the same as 'wrap' in buffer descriptor lingo). Here, 'current_iteration'
// keeps track of the current execution iteration of the buffer descriptor and
// is incremented after buffer descriptor execution. The 'stride' is the offset
// to be used for each execution of the buffer descriptor, relative to the
// previous one. When 'current_iteration' is equal to 'size', the
// 'current_iteration' is reset to zero.
//
// Although DMAs can have a different number of intra-iteration dimensions, all
// DMAs have a single inter-iteration dimension (at least in AIE2 and AIE2p).
static const size_t kAMDAIEDmaNbInterDims = 1;

/// Check whether two access patterns are equal in value, starting from
/// specified indices.
bool areAccessPatternsEqualFromIndices(ArrayRef<OpFoldResult> offsetsA,
Expand Down Expand Up @@ -220,119 +193,40 @@ LogicalResult foldUnitDims(MLIRContext *ctx,
/// be used for DMA access patterns.
struct DmaDimConfig {
const AMDAIE::AMDAIEDeviceModel &deviceModel;
AMDAIE::AMDAIETileType sourceTileType;
AMDAIE::AMDAIETileType targetTileType;
/// The maximum number of addressing dimensions on the source side of the DMA.
uint8_t sourceMaxNbDims{0};
/// The maximum number of addressing dimensions on the target side of the DMA.
uint8_t targetMaxNbDims{0};
AMDAIE::AMDAIETileType tileType;
/// The maximum number of addressing dimensions of the DMA.
uint8_t maxNbDims{0};
/// The number of `inter` addressing dimensions of the DMA.
uint8_t nbInterDims{0};

DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel,
uint8_t sourceMemspaceInt, uint8_t targetMemspaceInt)
DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel, uint8_t memSpace)
: deviceModel(deviceModel) {
uint8_t shimNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim);
uint8_t memTileNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::MEMTILE, AMDAIE::AMDAIEDmaProp::NumAddrDim);
uint8_t coreNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::AIETILE, AMDAIE::AMDAIEDmaProp::NumAddrDim);
if (sourceMemspaceInt == 0) {
sourceTileType = AMDAIE::AMDAIETileType::SHIMNOC;
sourceMaxNbDims = shimNbIntraDims + kAMDAIEDmaNbInterDims;
} else if (sourceMemspaceInt == 1) {
sourceTileType = AMDAIE::AMDAIETileType::MEMTILE;
sourceMaxNbDims = memTileNbIntraDims;
} else if (sourceMemspaceInt == 2) {
sourceTileType = AMDAIE::AMDAIETileType::AIETILE;
sourceMaxNbDims = coreNbIntraDims;
} else {
assert(false && "unsupported source memspace");
}
if (targetMemspaceInt == 0) {
targetTileType = AMDAIE::AMDAIETileType::SHIMNOC;
targetMaxNbDims = shimNbIntraDims + kAMDAIEDmaNbInterDims;
} else if (targetMemspaceInt == 1) {
targetTileType = AMDAIE::AMDAIETileType::MEMTILE;
targetMaxNbDims = memTileNbIntraDims;
} else if (targetMemspaceInt == 2) {
targetTileType = AMDAIE::AMDAIETileType::AIETILE;
targetMaxNbDims = coreNbIntraDims;
if (memSpace == 0) {
uint8_t shimNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::SHIMNOC, AMDAIE::AMDAIEDmaProp::NumAddrDim);
tileType = AMDAIE::AMDAIETileType::SHIMNOC;
nbInterDims = deviceModel.deviceConfig.dmaNbInterDims;
maxNbDims = shimNbIntraDims + nbInterDims;
} else if (memSpace == 1) {
uint8_t memTileNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::MEMTILE, AMDAIE::AMDAIEDmaProp::NumAddrDim);
tileType = AMDAIE::AMDAIETileType::MEMTILE;
maxNbDims = memTileNbIntraDims;
} else if (memSpace == 2) {
uint8_t coreNbIntraDims = deviceModel.getDmaProp<uint8_t>(
AMDAIE::AMDAIETileType::AIETILE, AMDAIE::AMDAIEDmaProp::NumAddrDim);
tileType = AMDAIE::AMDAIETileType::AIETILE;
maxNbDims = coreNbIntraDims;
} else {
assert(false && "unsupported target memspace");
assert(false && "unsupported memspace: ");
}
}

/// Return a vector containing the max stride values for every dimension. The
/// first dimension is the inter-iteration dimension, while the latter are
/// intra-iteration dimensions.
/// NOTE: It doesn't need to be known which BDs will be used exactly as all
/// BDs on the same tile types should have the same step and wrap sizes.
/// Therefore, `BD ID == 0` is chosen to be used to retrieve device
/// information.
template <CopyOpOperateOn OperateOn>
SmallVector<int64_t> getMaxStrides() const {
uint32_t maxIntraStride;
uint32_t maxInterStride;
if constexpr (OperateOn == CopyOpOperateOn::Source) {
maxIntraStride = deviceModel.getDmaBdProp<uint32_t>(
sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax);
maxInterStride = deviceModel.getDmaBdProp<uint32_t>(
sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax);
// +1 because values are encoded in HW BDs as (value - 1), so the range is
// [1:2^x].
SmallVector<int64_t> stepSizes(sourceMaxNbDims, maxIntraStride + 1);
stepSizes[0] = maxInterStride + 1;
return stepSizes;
} else if constexpr (OperateOn == CopyOpOperateOn::Target) {
maxIntraStride = deviceModel.getDmaBdProp<uint32_t>(
targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::StepSizeMax);
maxInterStride = deviceModel.getDmaBdProp<uint32_t>(
targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterStepSizeMax);
// +1 because values are encoded in HW BDs as (value - 1), so the range is
// [1:2^x].
SmallVector<int64_t> stepSizes(targetMaxNbDims, maxIntraStride + 1);
stepSizes[0] = maxInterStride + 1;
return stepSizes;
} else {
assert(false && "Function can only operate on Source or Target");
}
}
/// Return a vector containing the max size values for every dimension.
SmallVector<int64_t> getMaxSizes() const;

/// Return a vector containing the max size values for every dimension. The
/// first dimension is the inter-iteration dimension, while the latter are
/// intra-iteration dimensions.
/// NOTE: It doesn't need to be known which BDs will be used exactly as all
/// BDs on the same tile types should have the same step and wrap sizes.
/// Therefore, `BD ID == 0` is chosen to be used to retrieve device
/// information.
template <CopyOpOperateOn OperateOn>
SmallVector<int64_t> getMaxSizes() const {
uint32_t maxIntraSize;
uint32_t maxInterSize;
if constexpr (OperateOn == CopyOpOperateOn::Source) {
maxIntraSize = deviceModel.getDmaBdProp<uint16_t>(
sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax);
maxInterSize = deviceModel.getDmaBdProp<uint8_t>(
sourceTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax);
SmallVector<int64_t> stepSizes(sourceMaxNbDims, maxIntraSize);
stepSizes[0] = maxInterSize;
// The outermost intra size doesn't have limit in HW.
stepSizes[1] = std::numeric_limits<int64_t>::max();
return stepSizes;
} else if constexpr (OperateOn == CopyOpOperateOn::Target) {
maxIntraSize = deviceModel.getDmaBdProp<uint16_t>(
targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::WrapMax);
maxInterSize = deviceModel.getDmaBdProp<uint8_t>(
targetTileType, 0, AMDAIE::AMDAIEDmaBdProp::IterWrapMax);
SmallVector<int64_t> stepSizes(targetMaxNbDims, maxIntraSize);
stepSizes[0] = maxInterSize;
// The outermost intra size doesn't have limit in HW.
stepSizes[1] = std::numeric_limits<int64_t>::max();
return stepSizes;
} else {
assert(false && "Function can only operate on Source or Target");
}
}
/// Return a vector containing the max stride values for every dimension.
SmallVector<int64_t> getMaxStrides() const;
};

/// Utility to move the synchronization users (`amdaie.npu.dma_wait`) directly
Expand Down
Loading

0 comments on commit 838062a

Please sign in to comment.