diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h index 36e845931..dad3f83ea 100644 --- a/mlir/include/air/Util/Util.h +++ b/mlir/include/air/Util/Util.h @@ -222,6 +222,13 @@ std::optional getOffsetDimFromMemrefDim(int dimOnMemref, SmallVector strides, SmallVector memrefShape); +// Evaluate the affine expression of affine map on a sparse vector of constant +// ints. +std::optional +evaluateConstantsInMap(AffineMap map, + SmallVector> const_inputs, + MLIRContext *ctx); + } // namespace air } // namespace xilinx diff --git a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp index 6414f8b02..343961a00 100644 --- a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp +++ b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp @@ -1697,11 +1697,15 @@ struct CanonicalizeAffineApplyOnLoopInductionVar if (!getStaticScfForTripCountAsInt(sfo)) return failure(); int tripCount = *getStaticScfForTripCountAsInt(sfo); - auto new_ub = evaluateConstantInMap( - apply.getAffineMap(), *mlir::getConstantIntValue(sfo.getUpperBound()), + auto new_ub = air::evaluateConstantsInMap( + apply.getAffineMap(), + SmallVector>{ + *mlir::getConstantIntValue(sfo.getUpperBound())}, ctx); - auto new_lb = evaluateConstantInMap( - apply.getAffineMap(), *mlir::getConstantIntValue(sfo.getLowerBound()), + auto new_lb = air::evaluateConstantsInMap( + apply.getAffineMap(), + SmallVector>{ + *mlir::getConstantIntValue(sfo.getLowerBound())}, ctx); assert(new_ub && new_lb); int newStepInInt = llvm::divideCeilSigned(*new_ub - *new_lb, tripCount); @@ -1723,10 +1727,14 @@ struct CanonicalizeAffineApplyOnLoopInductionVar if (!afo.hasConstantBounds()) return failure(); int tripCount = *getStaticAffineForTripCountAsInt(afo); - auto new_ub = evaluateConstantInMap(apply.getAffineMap(), - afo.getConstantUpperBound(), ctx); - auto new_lb = evaluateConstantInMap(apply.getAffineMap(), - afo.getConstantLowerBound(), ctx); + auto new_ub = 
air::evaluateConstantsInMap( + apply.getAffineMap(), + SmallVector>{afo.getConstantUpperBound()}, + ctx); + auto new_lb = air::evaluateConstantsInMap( + apply.getAffineMap(), + SmallVector>{afo.getConstantLowerBound()}, + ctx); assert(new_ub && new_lb); int newStepInInt = llvm::divideCeilSigned(*new_ub - *new_lb, tripCount); IRMapping remap; @@ -1743,19 +1751,6 @@ struct CanonicalizeAffineApplyOnLoopInductionVar } private: - // Evaluate the affine expression of affine map on a constant affine - // expression. Only works with affine maps with a single input. - std::optional evaluateConstantInMap(AffineMap map, - int64_t const_input, - MLIRContext *ctx) const { - std::optional output = std::nullopt; - if (map.getNumInputs() != 1) - return output; - auto c = getAffineConstantExpr(const_input, ctx); - auto newmap = map.replace(getAffineSymbolExpr(0, ctx), c, 0, 1); - output = simplifyAffineMap(newmap).getSingleConstantResult(); - return output; - } }; // Fold arith.muli op operating on loop induction variable into loop bounds. diff --git a/mlir/lib/Transform/AIRMiscPasses.cpp b/mlir/lib/Transform/AIRMiscPasses.cpp index 89b905415..18cf6ff61 100644 --- a/mlir/lib/Transform/AIRMiscPasses.cpp +++ b/mlir/lib/Transform/AIRMiscPasses.cpp @@ -883,8 +883,8 @@ int findGCD(SmallVector vec) { // Tile air.channel put/get wrt a memref. Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor, int originalMemrefSize, int dim, - air::ChannelOp newChanOp, Location loc, - MLIRContext *ctx) { + memref::AllocOp allocOp, air::ChannelOp newChanOp, + Location loc, MLIRContext *ctx) { OpBuilder builder(originalChanOp); SmallVector originalApplyOperands; Operation *affineApplyOp = nullptr; @@ -912,11 +912,21 @@ Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor, auto checkpoint = builder.saveInsertionPoint(); if (affineApplyOp) builder.setInsertionPoint(affineApplyOp); + // Generate default affine.map assuming non-overlapping data access pattern. 
AffineExpr s0 = builder.getAffineSymbolExpr(0); AffineExpr mul = s0 * originalMemrefSize; AffineExpr add = mul + i * llvm::divideCeilSigned(originalMemrefSize, factor); auto map = AffineMap::get(0, 1, add); + // If allocOp has "affine_map" attribute set, then use that map instead + // (potentially overlapping access pattern). + if (allocOp->hasAttr("affine_map")) { + auto original_map = + allocOp->getAttrOfType("affine_map").getAffineMap(); + if (original_map.getNumInputs() == 2) + map = original_map.replace(getAffineSymbolExpr(1, ctx), + getAffineConstantExpr(i, ctx), 0, 1); + } auto newApplyOp = builder.create(loc, map, originalApplyOperands); if (affineApplyOp) @@ -956,6 +966,26 @@ Value tileChannelOpByFactor(air::ChannelInterface originalChanOp, int factor, return newWaitAll.getAsyncToken(); } +// Get scf.for op whose iv (indirectly) produces the val. +scf::ForOp getScfForFromVal(Value val) { + if (!val) + return scf::ForOp(); + if (auto res = scf::getForInductionVarOwner(val)) + return res; + auto defOp = val.getDefiningOp(); + if (!defOp) + return scf::ForOp(); + if (auto exec = dyn_cast(defOp)) { + auto exec_child = exec.getChildOp(); + if (!exec_child) + return scf::ForOp(); + for (auto oper : exec_child->getOperands()) + if (auto res = scf::getForInductionVarOwner(oper)) + return res; + } + return scf::ForOp(); +} + // Partition L2 memref. 
void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( SmallVector &puts, SmallVector &gets, @@ -964,6 +994,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( MemRefType ty = llvm::cast(memref.getType()); auto allocOp = memref.getDefiningOp(); auto loc = allocOp->getLoc(); + auto ctx = allocOp->getContext(); Operation *deallocOp = nullptr; for (auto user : memref.getUsers()) { if (auto execOp = dyn_cast(user->getParentOp())) { @@ -979,34 +1010,63 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( std::map> chanOpPartitions; SmallVector keys; - for (auto op : puts) { - auto offsetDim = air::getOffsetDimFromMemrefDim(dim, op.getStrides(), - air::getTensorShape(ty)); - if (!offsetDim) - continue; - auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]); - if (!offset) - continue; - push_back_if_unique(keys, *offset); - if (!chanOpPartitions.count(*offset)) - chanOpPartitions[*offset] = SmallVector{op}; - else - chanOpPartitions[*offset].push_back(op); - } - for (auto op : gets) { - auto offsetDim = air::getOffsetDimFromMemrefDim(dim, op.getStrides(), - air::getTensorShape(ty)); - if (!offsetDim) - continue; - auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]); - if (!offset) - continue; - push_back_if_unique(keys, *offset); - if (!chanOpPartitions.count(*offset)) - chanOpPartitions[*offset] = SmallVector{op}; - else - chanOpPartitions[*offset].push_back(op); - } + + // Get map of channel ops + auto getChanOpPartitionsMap = + [ctx, dim, + ty](std::map> &chanOpPartitions, + SmallVector &keys, air::ChannelInterface op) { + auto offsetDim = air::getOffsetDimFromMemrefDim( + dim, op.getStrides(), air::getTensorShape(ty)); + if (!offsetDim) + return; + auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]); + int offset_key = -1; + if (offset) + offset_key = *offset; // Const offset. + else { // Variadic offset (induction variable to an scf.for). 
+ auto forOp = getScfForFromVal(op.getOffsets()[*offsetDim]); + if (!forOp) + return; + auto iv = forOp.getInductionVar(); + if (!iv.hasOneUse()) + return; + auto lb = getConstantIntValue(forOp.getLowerBound()); + if (!lb) + return; + Operation *oneUser = nullptr; + for (auto user : iv.getUsers()) + oneUser = user; + if (auto apply = dyn_cast(oneUser)) { + SmallVector> const_ints; + for (auto oper : apply->getOperands()) { + if (auto constVal = getConstantIntValue(oper)) + const_ints.push_back(constVal); + else + const_ints.push_back(lb); + } + auto key_opt = air::evaluateConstantsInMap(apply.getAffineMap(), + const_ints, ctx); + if (!key_opt) + return; + offset_key = *key_opt; + } else + offset_key = *lb; + } + if (offset_key < 0) + return; + push_back_if_unique(keys, offset_key); + if (!chanOpPartitions.count(offset_key)) + chanOpPartitions[offset_key] = SmallVector{op}; + else + chanOpPartitions[offset_key].push_back(op); + }; + + for (auto op : puts) + getChanOpPartitionsMap(chanOpPartitions, keys, op); + for (auto op : gets) + getChanOpPartitionsMap(chanOpPartitions, keys, op); + OpBuilder builder(allocOp); SmallVector mutatedScfForOps; for (auto key : keys) { @@ -1019,10 +1079,22 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( air::getTensorShape(ty)); if (!offsetDim) continue; - if (op.getSizes().size() == newMemrefShape.size()) { + if (op.getSizes().size() != newMemrefShape.size()) + continue; + auto offset = getConstantIntValue(op.getOffsets()[*offsetDim]); + if (offset) newMemrefShape[dim] = *getConstantIntValue(op.getSizes()[*offsetDim]); - break; + else { + auto forOp = getScfForFromVal(op.getOffsets()[*offsetDim]); + if (!forOp) + continue; + auto trip_count = air::getStaticScfForTripCountAsInt(forOp); + if (!trip_count) + continue; + newMemrefShape[dim] = + *getConstantIntValue(op.getSizes()[*offsetDim]) * (*trip_count); } + break; } auto newMemrefType = MemRefType::get(newMemrefShape, ty.getElementType(), @@ -1031,9 +1103,9 @@ void 
AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( Value newMemref = nullptr; // Create new alloc ops. if (isa(allocOp)) { - auto execOp = builder.create( - loc, air::AsyncTokenType::get(allocOp->getContext()), newMemrefType, - SmallVector{}); + auto execOp = + builder.create(loc, air::AsyncTokenType::get(ctx), + newMemrefType, SmallVector{}); Block *async_bb = builder.createBlock(&execOp.getBody()); builder.setInsertionPointToStart(async_bb); auto childMemAlloc = builder.create(loc, newMemrefType); @@ -1048,7 +1120,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( builder.setInsertionPoint(deallocOp); if (auto execDeallocOp = dyn_cast(deallocOp)) { auto execOp = builder.create( - loc, air::AsyncTokenType::get(deallocOp->getContext()), + loc, air::AsyncTokenType::get(ctx), execDeallocOp.getAsyncDependencies()); Block *async_bb = builder.createBlock(&execOp.getBody()); builder.setInsertionPointToStart(async_bb); @@ -1073,7 +1145,24 @@ void AIRSplitL2MemrefForBufferConstraintPass::partitionMemref( continue; int offsetOperandOffset = memrefOperandOffset + *offsetDim + 1; auto &offsetOpOper = op->getOpOperand(offsetOperandOffset); - offsetOpOper.assign(builder.create(loc, 0)); + + // Const offset. Reset offset to 0. + if (getConstantIntValue(op.getOffsets()[*offsetDim])) + offsetOpOper.assign(builder.create(loc, 0)); + // Variadic offset. Reset const operands of apply to 0. + else { + auto defOp = op.getOffsets()[*offsetDim].getDefiningOp(); + affine::AffineApplyOp apply = dyn_cast(defOp); + air::ExecuteOp exec = dyn_cast(defOp); + if (exec && isa(exec.getChildOp())) + apply = dyn_cast(exec.getChildOp()); + assert(apply && "Apply op not found. NYI."); + for (auto oper : apply->getOperands()) + if (getConstantIntValue(oper)) + apply->replaceUsesOfWith( + oper, builder.create(loc, 0)); + } + // Update strides (contiguous, row-major) after memref tiling. SmallVector newStrides; // One dimensional default stride value. 
@@ -1209,6 +1298,21 @@ AIRSplitL2MemrefForBufferConstraintPass::getTargetMemrefAllocs( allocOp->setAttr( "split_dim", IntegerAttr::get(IntegerType::get(ctx, 32), splitDim)); + + // If there is an affine.apply operating on offsets[split_dim], then + // log the affine.map. + auto offsetDefOp = chanOp.getOffsets()[splitDim].getDefiningOp(); + if (offsetDefOp) { + affine::AffineApplyOp apply = + dyn_cast(offsetDefOp); + if (auto exec = dyn_cast(offsetDefOp)) + if (auto exec_child_apply = + dyn_cast(exec.getChildOp())) + apply = exec_child_apply; + if (apply) + allocOp->setAttr("affine_map", + AffineMapAttr::get(apply.getAffineMap())); + } } // Tiling along the first (x) dimension of scf.parallel only, as one NPU // memtile is located at the bottom of each column. @@ -1345,7 +1449,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::runOnOperation() { int offsetDim = offsetDimOpt ? *offsetDimOpt : dim; auto newWaitAll = tileChannelOpByFactor( chanUserOp, targetColTilingFactor, memrefShape[dim], offsetDim, - new_chan, loc, ctx); + allocOp, new_chan, loc, ctx); // Update async dependency. auto old_token = @@ -1384,7 +1488,7 @@ void AIRSplitL2MemrefForBufferConstraintPass::runOnOperation() { } Value newWaitAll1 = tileChannelOpByFactor( theOtherChanOp[0], targetColTilingFactor, memrefShape[dim], - dim - numLeadingSingletonDimDiff, new_chan, loc, ctx); + dim - numLeadingSingletonDimDiff, allocOp, new_chan, loc, ctx); // Update dependency. auto oldToken = diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp index 245a7404b..1647a1062 100644 --- a/mlir/lib/Util/Util.cpp +++ b/mlir/lib/Util/Util.cpp @@ -1511,3 +1511,26 @@ air::getOffsetDimFromMemrefDim(int dimOnMemref, SmallVector strides, } return std::nullopt; } + +// Evaluate the affine expression of affine map on a sparse vector of constant +// ints. 
+std::optional<int64_t>
+air::evaluateConstantsInMap(AffineMap map,
+                            SmallVector<std::optional<int64_t>> const_inputs,
+                            MLIRContext *ctx) {
+  // const_inputs follows operand order (dims, then symbols); nullopt = skip.
+  if (map.getNumInputs() != const_inputs.size())
+    return std::nullopt;
+  const unsigned numDims = map.getNumDims();
+  for (unsigned i = 0; i < const_inputs.size(); i++)
+    if (const_inputs[i])
+      map = map.replace(i < numDims ? getAffineDimExpr(i, ctx)
+                                    : getAffineSymbolExpr(i - numDims, ctx),
+                        getAffineConstantExpr(*const_inputs[i], ctx), numDims,
+                        map.getNumSymbols());
+  // getSingleConstantResult() asserts on a non-constant map, so check first.
+  map = simplifyAffineMap(map);
+  if (!map.isSingleConstant())
+    return std::nullopt;
+  return map.getSingleConstantResult();
+}
diff --git a/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir b/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
index 3a246dbd0..57749050e 100644
--- a/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
+++ b/mlir/test/Transform/AIRMiscPasses/air_split_l2_memref.mlir
@@ -1174,3 +1174,156 @@ module {
     return
   }
 }
+
+// -----
+
+// Conv2d 3x3, stride 2 (overlapping l2 access).
+ +// CHECK: [[$MAP0:#map[0-9]*]] = affine_map<()[s0] -> (s0 + 2)> +// CHECK: [[$MAP1:#map[0-9]+]] = affine_map<()[s0] -> (s0 + 4)> +// CHECK: [[$MAP2:#map[0-9]+]] = affine_map<()[s0] -> (s0 + 6)> + +// CHECK-LABEL: func.func @test9 +// CHECK: air.launch +// CHECK: %[[VAL0:.*]] = affine.apply [[$MAP0]]() +// CHECK: %[[VAL1:.*]] = affine.apply [[$MAP1]]() +// CHECK: %[[VAL2:.*]] = affine.apply [[$MAP2]]() +// CHECK: air.channel.put {{.*}} @channel_0[%c0, %c0] +// CHECK: air.channel.put {{.*}} @channel_0[%c1, %c0] (%{{.*}}[%c0, %[[VAL0]] +// CHECK: air.channel.put {{.*}} @channel_0[%c2, %c0] (%{{.*}}[%c0, %[[VAL1]] +// CHECK: air.channel.put {{.*}} @channel_0[%c3, %c0] (%{{.*}}[%c0, %[[VAL2]] +// CHECK: air.segment +// CHECK: %[[TOKEN0:.*]], %[[ALLOC0:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) { +// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1> +// CHECK: %[[TOKEN1:.*]], %[[ALLOC1:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) { +// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1> +// CHECK: %[[TOKEN2:.*]], %[[ALLOC2:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) { +// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1> +// CHECK: %[[TOKEN3:.*]], %[[ALLOC3:.*]] = air.execute -> (memref<1x3x33x16xi8, 1>) { +// CHECK-NEXT: memref.alloc() : memref<1x3x33x16xi8, 1> +// CHECK: air.channel.get async{{.*}}@channel_0[%c0{{.*}}, %c0{{.*}}] (%[[ALLOC0]] +// CHECK: air.channel.get async{{.*}}@channel_0[%c1{{.*}}, %c0{{.*}}] (%[[ALLOC1]] +// CHECK: air.channel.get async{{.*}}@channel_0[%c2{{.*}}, %c0{{.*}}] (%[[ALLOC2]] +// CHECK: air.channel.get async{{.*}}@channel_0[%c3{{.*}}, %c0{{.*}}] (%[[ALLOC3]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c0{{.*}}] (%[[ALLOC0]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c0{{.*}}] (%[[ALLOC1]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c0{{.*}}] (%[[ALLOC2]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c0{{.*}}] (%[[ALLOC3]] +// CHECK: air.channel.put 
async{{.*}}@channel_3[%c0{{.*}}, %c1{{.*}}] (%[[ALLOC0]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c1{{.*}}] (%[[ALLOC1]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c1{{.*}}] (%[[ALLOC2]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c1{{.*}}] (%[[ALLOC3]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c2{{.*}}] (%[[ALLOC0]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c2{{.*}}] (%[[ALLOC1]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c2{{.*}}] (%[[ALLOC2]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c2{{.*}}] (%[[ALLOC3]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c0{{.*}}, %c3{{.*}}] (%[[ALLOC0]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c1{{.*}}, %c3{{.*}}] (%[[ALLOC1]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c2{{.*}}, %c3{{.*}}] (%[[ALLOC2]] +// CHECK: air.channel.put async{{.*}}@channel_3[%c3{{.*}}, %c3{{.*}}] (%[[ALLOC3]] +// CHECK: air.herd +// CHECK: air.channel.get async{{.*}}@channel_3 + +#map = affine_map<()[s0] -> (s0 * 8)> +#map1 = affine_map<()[s0] -> (s0 * 32)> +#map2 = affine_map<()[s0, s1] -> (s0 + s1 * 2)> +#map3 = affine_map<()[s0, s1] -> (s0 + s1 * 8)> +module { + air.channel @channel_3 [4, 4] + air.channel @channel_1 [1, 1] + func.func @test9(%arg0: memref<1x513x513x16xi8>, %arg1: memref<3x3x16x32xi8>, %arg2: memref<1x256x256x32xi32>) { + %c64 = arith.constant 64 : index + %c4 = arith.constant 4 : index + %c16 = arith.constant 16 : index + %0 = air.launch async (%arg3, %arg4, %arg5) in (%arg6=%c64, %arg7=%c16, %arg8=%c4) args(%arg9=%arg0) : memref<1x513x513x16xi8> attributes {id = 1 : i32} { + %c8208 = arith.constant 8208 : index + %c4210704 = arith.constant 4210704 : index + %c16_0 = arith.constant 16 : index + %c33 = arith.constant 33 : index + %c9 = arith.constant 9 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %async_token, %results = air.execute -> (index) { + %3 = 
affine.apply #map()[%arg3] + air.execute_terminator %3 : index + } + %async_token_1, %results_2 = air.execute -> (index) { + %3 = affine.apply #map1()[%arg4] + air.execute_terminator %3 : index + } + %1 = air.channel.put async [%async_token, %async_token_1] @channel_1[] (%arg9[%c0, %results, %results_2, %c0] [%c1, %c9, %c33, %c16_0] [%c4210704, %c8208, %c16_0, %c1]) {id = 1 : i32} : (memref<1x513x513x16xi8>) + %2 = air.segment @segment_0 async attributes {id = 2 : i32} { + %c7 = arith.constant 7 : index + %c4752 = arith.constant 4752 : index + %c528 = arith.constant 528 : index + %c8 = arith.constant 8 : index + %c3 = arith.constant 3 : index + %c16_3 = arith.constant 16 : index + %c1_4 = arith.constant 1 : index + %c0_5 = arith.constant 0 : index + %c4_6 = arith.constant 4 : index + %3 = air.wait_all async + %4 = air.wait_all async + %async_token_7, %results_8 = air.execute -> (memref<1x9x33x16xi8, 1 : i32>) { + %alloc = memref.alloc() : memref<1x9x33x16xi8, 1 : i32> + air.execute_terminator %alloc : memref<1x9x33x16xi8, 1 : i32> + } + %5 = air.channel.get async [%3, %4, %async_token_7] @channel_1[] (%results_8[] [] []) {id = 4 : i32} : (memref<1x9x33x16xi8, 1 : i32>) + %6 = scf.parallel (%arg10, %arg11) = (%c0_5, %c0_5) to (%c4_6, %c4_6) step (%c1_4, %c1_4) init (%5) -> !air.async.token { + %8 = scf.for %arg12 = %c0_5 to %c3 step %c1_4 iter_args(%arg13 = %5) -> (!air.async.token) { + %9 = scf.for %arg14 = %c0_5 to %c3 step %c1_4 iter_args(%arg15 = %arg13) -> (!air.async.token) { + %10 = scf.for %arg16 = %c0_5 to %c16_3 step %c8 iter_args(%arg17 = %arg15) -> (!air.async.token) { + %async_token_10, %results_11 = air.execute [%arg17] -> (index) { + %12 = affine.apply #map2()[%arg12, %arg10] + air.execute_terminator %12 : index + } + %async_token_12, %results_13 = air.execute [%arg17] -> (index) { + %12 = affine.apply #map3()[%arg14, %arg11] + air.execute_terminator %12 : index + } + %11 = air.channel.put async [%async_token_10, %async_token_12] @channel_3[%arg10, 
%arg11] (%results_8[%c0_5, %results_11, %results_13, %arg16] [%c1_4, %c1_4, %c7, %c8] [%c4752, %c528, %c16_3, %c1_4]) {id = 7 : i32} : (memref<1x9x33x16xi8, 1 : i32>) + scf.yield %11 : !air.async.token + } + scf.yield %10 : !air.async.token + } + scf.yield %9 : !air.async.token + } + scf.reduce(%8 : !air.async.token) { + ^bb0(%arg12: !air.async.token, %arg13: !air.async.token): + %9 = air.wait_all async [%arg12, %arg13] + scf.reduce.return %9 : !air.async.token + } + } + %7 = air.herd @herd_0 async [%5] tile (%arg10, %arg11) in (%arg12=%c4_6, %arg13=%c4_6) attributes {id = 3 : i32} { + %c0_10 = arith.constant 0 : index + %c16_11 = arith.constant 16 : index + %c8_12 = arith.constant 8 : index + %c3_13 = arith.constant 3 : index + %c1_14 = arith.constant 1 : index + %8 = air.wait_all async + %9 = scf.for %arg14 = %c0_10 to %c3_13 step %c1_14 iter_args(%arg15 = %8) -> (!air.async.token) { + %10 = scf.for %arg16 = %c0_10 to %c3_13 step %c1_14 iter_args(%arg17 = %arg15) -> (!air.async.token) { + %11 = scf.for %arg18 = %c0_10 to %c16_11 step %c8_12 iter_args(%arg19 = %arg17) -> (!air.async.token) { + %async_token_15, %results_16 = air.execute -> (memref<1x1x7x8xi8, 2 : i32>) { + %alloc = memref.alloc() : memref<1x1x7x8xi8, 2 : i32> + air.execute_terminator %alloc : memref<1x1x7x8xi8, 2 : i32> + } + %12 = air.channel.get async [%arg19, %async_token_15] @channel_3[%arg10, %arg11] (%results_16[] [] []) {id = 9 : i32} : (memref<1x1x7x8xi8, 2 : i32>) + %async_token_17 = air.execute { + memref.dealloc %results_16 : memref<1x1x7x8xi8, 2 : i32> + } + scf.yield %12 : !air.async.token + } + scf.yield %11 : !air.async.token + } + scf.yield %10 : !air.async.token + } + } + %async_token_9 = air.execute [%5] { + memref.dealloc %results_8 : memref<1x9x33x16xi8, 1 : i32> + } + } + } + return + } +}