Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen committed Dec 10, 2024
1 parent 2243dd8 commit 3844e23
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 166 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run.

// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-npu-dma-to-half-dma-cpy-nd,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s
// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-npu-dma-to-half-dma-cpy-nd,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,32 @@ FailureOr<AMDAIE::TileOp> getGeneratorTileOp(
return tileOp;
};

/// Utility to retrieve the `amdaie.channel` op used by a DMA copy operation on
/// the side selected by `OperateOn` (`Source` or `Target`).
///
/// The channel is looked up through the `amdaie.connection` op that `npuDmaOp`
/// operates on. An error is emitted and failure is returned if:
///   - `npuDmaOp` does not operate on a connection op,
///   - the connection does not have exactly one channel on the selected side,
///   - that channel value is not defined by an `amdaie.channel` op, or
///   - `OperateOn` is neither `Source` nor `Target`.
/// On success, the returned ChannelOp is guaranteed to be non-null.
template <CopyOpOperateOn OperateOn>
FailureOr<AMDAIE::ChannelOp> getChannelOp(AMDAIE::NpuDmaCpyNdOp &npuDmaOp) {
  AMDAIE::ConnectionOp connectionOp = npuDmaOp.getConnectionOp();
  if (!connectionOp) {
    return npuDmaOp.emitOpError()
           << "should operate on an `amdaie.connection` op";
  }
  if constexpr (OperateOn == CopyOpOperateOn::Source) {
    if (connectionOp.getSourceChannels().size() != 1)
      return connectionOp.emitOpError() << "expected a single source channel";
    // `getDefiningOp<OpTy>()` tolerates values without a defining op (e.g.
    // block arguments) and yields a null op on any mismatch, so a single null
    // check covers both cases instead of handing a null op back as success.
    auto sourceChannelOp =
        connectionOp.getSourceChannels()[0].getDefiningOp<AMDAIE::ChannelOp>();
    if (!sourceChannelOp) {
      return connectionOp.emitOpError()
             << "expected the source channel to be defined by an "
                "`amdaie.channel` op";
    }
    return sourceChannelOp;
  } else if constexpr (OperateOn == CopyOpOperateOn::Target) {
    if (connectionOp.getTargetChannels().size() != 1)
      return connectionOp.emitOpError() << "expected a single target channel";
    // Same null-safe lookup as for the source side.
    auto targetChannelOp =
        connectionOp.getTargetChannels()[0].getDefiningOp<AMDAIE::ChannelOp>();
    if (!targetChannelOp) {
      return connectionOp.emitOpError()
             << "expected the target channel to be defined by an "
                "`amdaie.channel` op";
    }
    return targetChannelOp;
  } else {
    return npuDmaOp.emitOpError()
           << "Function can only operate on Source or Target";
  }
}

std::optional<uint32_t> getNumberIterations(scf::ForOp loop) {
std::optional<uint32_t> lowerBound =
getConstantIntValue(loop.getLowerBound());
Expand Down Expand Up @@ -146,13 +172,20 @@ template <CopyOpOperateOn OperateOn>
FailureOr<AMDAIE::BdIdOp> getBdIdOp(
IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap,
DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> &bdIdOpToBdIdsMap,
uint32_t channel) {
FailureOr<AMDAIE::TileOp> tileOp =
DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> &bdIdOpToBdIdsMap) {
// Get the TileOp.
FailureOr<AMDAIE::TileOp> maybeTileOp =
getGeneratorTileOp<OperateOn>(npuDmaOp, shimTileToGeneratorMap);
if (failed(tileOp)) return failure();
if (failed(maybeTileOp)) return failure();
AMDAIE::TileOp tileOp = maybeTileOp.value();
// Get the channel.
FailureOr<AMDAIE::ChannelOp> maybeChannelOp =
getChannelOp<OperateOn>(npuDmaOp);
if (failed(maybeChannelOp)) return failure();
AMDAIE::ChannelOp channelOp = maybeChannelOp.value();
uint32_t channel = channelOp.getValue();

ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()];
ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp.getResult()];
rewriter.setInsertionPoint(npuDmaOp);
if (scf::ForOp loop = npuDmaOp->getParentOfType<scf::ForOp>();
loop && getNumberIterations(loop)) {
Expand All @@ -165,7 +198,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(

// Get the number of BD IDs that will be assigned to the current DMA op.
uint32_t numRequired = 0;
getNumRequiredBdIds(loop, npuDmaOp, *tileOp, shimTileToGeneratorMap,
getNumRequiredBdIds(loop, npuDmaOp, tileOp, shimTileToGeneratorMap,
numRequired);
uint32_t numAvailable = generator.getNumAvailableBdIds(channel);
uint32_t size = std::max(numAvailable / numRequired, 1u);
Expand Down Expand Up @@ -193,7 +226,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(
iv,
});
AMDAIE::BdIdOp bdIdOp = rewriter.create<AMDAIE::BdIdOp>(
rewriter.getUnknownLoc(), *tileOp, affineApply.getResult());
rewriter.getUnknownLoc(), tileOp, affineApply.getResult());
bdIdOpToBdIdsMap[bdIdOp] = bdIds;
return bdIdOp;
}
Expand All @@ -206,7 +239,7 @@ FailureOr<AMDAIE::BdIdOp> getBdIdOp(
auto constant = rewriter.create<arith::ConstantOp>(
rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdId.value()));
AMDAIE::BdIdOp bdIdOp = rewriter.create<AMDAIE::BdIdOp>(
rewriter.getUnknownLoc(), *tileOp, constant.getResult());
rewriter.getUnknownLoc(), tileOp, constant.getResult());
return bdIdOp;
};

Expand Down Expand Up @@ -266,13 +299,6 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
}
});

// TODO(jornt): Temporarily use channel 0 for all DMAs. This should
// return correct results for Shim channels, however, for generality
// towards other DMAs and future hardware generations, channel
// assignment should happen before BD assignment. This requires more
// refactoring.
const uint32_t channel = 0;

DenseMap<AMDAIE::BdIdOp, SmallVector<uint32_t>> bdIdOpToBdIdsMap;
// Walk `amdaie.npu_dma_cpy_nd` and `amdaie.dma_wait` operations and assign
// and release BD IDs when encountering the respective operations using the
Expand All @@ -282,8 +308,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op)) {
if (npuDmaOp.getSource()) {
FailureOr<AMDAIE::BdIdOp> bdIdOp = getBdIdOp<CopyOpOperateOn::Source>(
rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap,
channel);
rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap);
if (failed(bdIdOp)) return WalkResult::interrupt();
rewriter.setInsertionPoint(npuDmaOp);
npuDmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
Expand All @@ -296,8 +321,7 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
}
if (npuDmaOp.getTarget()) {
FailureOr<AMDAIE::BdIdOp> bdIdOp = getBdIdOp<CopyOpOperateOn::Target>(
rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap,
channel);
rewriter, npuDmaOp, shimTileToGeneratorMap, bdIdOpToBdIdsMap);
if (failed(bdIdOp)) return WalkResult::interrupt();
rewriter.setInsertionPoint(npuDmaOp);
(void)rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,10 @@ void addAMDAIEObjectFifoLoweringPasses(
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEDmaCSEPass());

passManager.addPass(createAMDAIEAssignChannelsPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());
Expand All @@ -650,10 +654,6 @@ void addAMDAIEObjectFifoLoweringPasses(
passManager.addPass(createAMDAIEConvertCoreForallToForPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEAssignChannelsPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEObjFifoBufferizationPass());
passManager.addPass(createAMDAIETemporaryAllocBufferizationPass());
passManager.addPass(createAMDAIEConnectionToFlowPass());
Expand Down
Loading

0 comments on commit 3844e23

Please sign in to comment.